diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 86322616fa..8165ec95fc 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -20,4 +20,4 @@ A clear and concise description of what you expected to happen. - Environment location: [Bare-metal, Docker, Cloud(specify cloud provider)] **Additional context** -Add any other context about the problem here. \ No newline at end of file +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..4572ae1b98 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,5 @@ +blank_issues_enabled: true +contact_links: + - name: CUTLASS Discord + url: https://discord.gg/nvidiadeveloper + about: Come chat about using and contributing to CUTLASS! diff --git a/.github/ISSUE_TEMPLATE/documentation_request.md b/.github/ISSUE_TEMPLATE/documentation_request.md index 9e96105f5d..c9fa21fac9 100644 --- a/.github/ISSUE_TEMPLATE/documentation_request.md +++ b/.github/ISSUE_TEMPLATE/documentation_request.md @@ -32,4 +32,4 @@ A clear and concise description of what documentation you believe it is needed a A clear and concise description of what you want to happen. **Steps taken to search for needed documentation** -List any steps you have taken: \ No newline at end of file +List any steps you have taken: diff --git a/.github/ISSUE_TEMPLATE/submit_question.md b/.github/ISSUE_TEMPLATE/submit_question.md index 743f893fcb..5aa2a672d2 100644 --- a/.github/ISSUE_TEMPLATE/submit_question.md +++ b/.github/ISSUE_TEMPLATE/submit_question.md @@ -7,4 +7,4 @@ assignees: '' --- -**What is your question?** \ No newline at end of file +**What is your question?** diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 6510938e85..23956a02fb 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -8,4 +8,4 @@ jobs: steps: - uses: actions/labeler@main with: - repo-token: "${{ secrets.GITHUB_TOKEN }}" \ No newline at end of file + repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/new-issues-to-triage-projects.yml b/.github/workflows/new-issues-to-triage-projects.yml index 3049176e3b..a963cb2f89 100644 --- a/.github/workflows/new-issues-to-triage-projects.yml +++ b/.github/workflows/new-issues-to-triage-projects.yml @@ -32,4 +32,4 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_PROJECT_URL: https://github.com/NVIDIA/cutlass - GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing' \ No newline at end of file + GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing' diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index cb2e7275bd..8b65da69aa 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -54,4 +54,4 @@ jobs: exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue" days-before-pr-stale: 90 days-before-pr-close: -1 - operations-per-run: 50 \ No newline at end of file + operations-per-run: 50 diff --git a/.gitignore b/.gitignore index 1328f6b7d6..acddb1f9d1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ # PyCache files __pycache__/ +cutlass_library.egg-info/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index d09b4981e4..1ba870eba2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,57 +1,270 @@ # NVIDIA CUTLASS Changelog +## [3.6.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.6.0) (2024-10-03) + +- [Hopper structured sparse 
GEMM](./examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu). + + [FP16](./test/unit/gemm/device/sm90_sparse_gemm_f16_f16_f32_tensor_op_f32.cu) + + [FP8](./test/unit/gemm/device/sm90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu) + + [INT8](./test/unit/gemm/device/sm90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu) + + [TF32](./test/unit/gemm/device/sm90_sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu) +- A refactor to the CUTLASS 3.x convolution `kernel::ConvUniversal` [API](./include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp) to bring it in line with `gemm::GemmUniversal`. Now the 3.x convolution API is no longer considered as a beta API. +- Improve [mixed input GEMM](./examples/55_hopper_mixed_dtype_gemm/README.md). + + Added a [lookup table implementation](./examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu) for `INT4`x`FP8` scale-only mode. + + Added [layout pre-shuffling](./examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu#L50-55) to optimize memory loading. + + Added [interleaved conversion](./examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_bf16_gemm.cu#L50-52) for `{INT4, UINT4, INT8}` x `{FP16, BF16}`. + + Other general optimizations. +- The suffixes of the mixed input kernel schedules have been removed. Use `KernelTmaWarpSpecialized`, `KernelTmaWarpSpecializedPingpong` and `KernelTmaWarpSpecializedCooperative` instead. +- [EVT nodes for Top-K selection and softmax](./include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp) and [GEMM example using those](./examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu). +- [Programmatic Dependent Launch](./include/cutlass/arch/grid_dependency_control.h) (PDL) that leverages a new Hopper feature to speedup two back-to-back kernels, and its corresponding [documentations](./media/docs/dependent_kernel_launch.md). +- [A new debugging tool, synclog](./include/cutlass/arch/synclog.hpp), for dumping out all synchronization events from within a kernel to a file. Please see [synclog documentation](./media/docs/utilities.md#debugging-asynchronous-kernels-with-cutlasss-built-in-synclog-tool) for details. +- A new TMA-enabled [epilogue](./include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for grouped GEMM that brings significant performance improvement, as well as its EVT support. +- A SIMT-enabled pointer-array [epilogue](./include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp). +- A new [Ping-Pong kernel schedule for Grouped GEMM](./include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp) and some other optimizations. +- [A new instantiation strategy for CUTLASS profiler kernels](./python/cutlass_library/sm90_shapes.py) along with [improved documentation for instantiation level in CUTLASS profiler](./media/docs/profiler.md#instantiating-more-kernels-with-hopper). +- A new hardware support for comparisons and computations of [`cutlass::bfloat16_t`](./include/cutlass/bfloat16.h) +- Fixed use of isnan on Windows for [`half_t`](./test/unit/core/functional.cu). +- Various improvements and fixes from the community and CUTLASS team. Thanks to everyone who submitted PRs! 
+
+## [3.5.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.5.1) (2024-07-25)
+
+- [Minimal SM90 WGMMA + TMA GEMM example in 100 lines of code](./examples/cute/tutorial/wgmma_sm90.cu)
+- [Exposure of L2 `cache_hint`s in TMA copy atoms](./include/cute/arch/copy_sm90_tma.hpp#L48)
+- Exposure of raster order and tile swizzle extent in [CUTLASS library profiler](./media/docs/profiler.md#GEMM), and
+[example 48](./examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu).
+- [TMA store based and EVT supported epilogues](./include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for [Hopper pointer array batched kernels](./test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu).
+- A new [`GemmSparseUniversal` API for CUTLASS 2.x Ampere kernels](./include/cutlass/gemm/device/gemm_sparse_universal.h) to enable serial and parallel split-k for sparse tensor cores and new tiny tile sizes to better support LLM inferrence: + + [FP16 TN](./test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu#L269-L393) and [NT](./test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu#L269-L411). + + [int8 TN](./test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu#L264-L452). + + [int4 TN](./test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu#L264-L452). + + [FP32 TN](./test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu#L427-L642) and [NT](./test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu#L427-L456). +- [CUDA host adapter](./include/cutlass/cuda_host_adapter.hpp) extensions to support TMA descriptor construction driver APIs. +- Inclusion of more [Hopper fprop, dgrad, and wgrad convolution kernels in CUTLASS library and profiler](./python/cutlass_library/generator.py). +- Support for residual add (beta != 0) in convolution kernels. +- A new convolution [epilogue](./examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu#L269) for CUTLASS 2.x to support non-packed NHWC output. +- A refactor of [include files throughout CUTLASS core directories](./include/cutlass/gemm/collective/collective_mma_decl.hpp) to reduce circular dependencies and [tests to guard against them](./test/self_contained_includes/CMakeLists.txt). +- [A guide for setting up VSCode to work well with CUTLASS](./media/docs/ide_setup.md) and [expanded code style guide](./media/docs/programming_guidelines.md). +- Better support for MSVC as a host compiler. +- Many performance optimizations, improvements, and bug fixes including fixes for FlashAttention-2. +- Optimal code generation with CUDA toolkit versions 12.4 and 12.5u1. + +## [3.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.5.0) (2024-04-09) + +- Implicit GEMM Convolutions targeting Hopper SM90A via WGMMA + [TMA im2col](./include/cute/atom/copy_traits_sm90_im2col.hpp) + + Native implementation in CUTLASS 3.x using CuTe, mirroring the [same design hierarchy as that of GEMMs](./media/docs/gemm_api_3x.md). + + Support for 1D, 2D, and 3D convolutions in a [rank-agnostic fashion](./include/cutlass/conv/convnd_problem_shape.hpp). + + Support for [Fprop](./test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu), [Dgrad](./test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu), and [Wgrad](./test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu) algorithms + + [CUTLASS profiler support](./python/cutlass_library/conv3x_emitter.py) for 2D and 3D convolutions implemented via the 3.x API. + + NOTE: this is a beta release. Further updates to CUTLASS will include major performance improvements, feature enablement, and possible breaking changes to the API until 3.7 release. Your feedback is welcome on the design! +- Support for [Ada (SM89) FP8 tensor cores via the 2.x API](./examples/58_ada_fp8_gemm/ada_fp8_gemm.cu). Requires CUDA 12.4 or newer. +- [Ampere gather/scatter convolution example](./examples/59_ampere_gather_scatter_conv/README.md) in CuTe and CUTLASS 3.x + + Showcasing how custom kernels can be written and optimized using CUTLASS 3.x and CuTe and the general strategy for implementing convolutions as specializations of GETTs. 
+ + Implementation of a coarse grained sparse gather/scatter kernel achieving peak performance on Ampere class tensor cores. +- 32x and 16x tile sizes are added to CUTLASS 2.x to improve the performance of narrow-tall and wide-short matrices. + + [Ampere FP16 TN](./test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm80.cu) and [NT](./test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu#L227-L301), [Ampere INT8 TN](./test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu#L392-L1342), [Ampere INT4 TN](./test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu#L372-L934). + + [Turing FP16 TN](./test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm75.cu#L55-L394), [Turing INT8 TN](./test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu#L166-L537), [Turing INT4 TN](./test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu#L310-L564). +- Updates to CuTe documentation for [`cute::Tensor<>`](./media/docs/cute/03_tensor.md), [MMA atoms](./media/docs/cute/0t_mma_atom.md), and an overhauled [CuTe GEMM tutorial series](./examples/cute/tutorial). +- Extensions to CuTe to support [L2 prefetching](./include/cute/algorithm/prefetch.hpp) and [TMA store+reductions](./include/cute/arch/copy_sm90_tma.hpp#L1337). +- Remove C++11 requirement on a few CUTLASS 2.x API header files. All CUTLASS files now require C++17. +- Fixes to greatly reduce build warnings. +- Updates and bugfixes from the community (thanks!) + +## [3.4.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.4.1) (2024-02-14) + +- Statically available [CUTLASS Version macros](./include/cutlass/version.h) that allow for handling API changes between CUTLASS releases on the users' side. +- Improvements for Hopper [Group-GEMMs](./examples/57_hopper_grouped_gemm) and [Pointer-Array Batched GEMMs](./examples/56_hopper_ptr_array_batched_gemm). +- Updates and bugfixes from the community (thanks!). + +## [3.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.4.0) (2024-01-12) +* Expanded [Mixed-input Hopper GEMMs](./examples/55_hopper_mixed_dtype_gemm) support covering {16-bit, 8-bit} x {8-bit, 4-bit} input types with fast numerical converters and group scaling factors. +* Performance improvements to [Mixed-input Hopper GEMMs](./examples/55_hopper_mixed_dtype_gemm) +* Beta release of [Pointer-Array Batched GEMMs](./examples/56_hopper_ptr_array_batched_gemm) now available on Hopper GPUs utilizing TMA and WGMMA (requires CUDA 12.3 or above). +* Beta release of [Group-GEMM](./examples/57_hopper_grouped_gemm) utilizing TMA and WGMMA (requires CUDA 12.3 or above). +* [Ampere Sparse GEMM](./examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu) supports Epilogue Visitor Tree (EVT) now. +* NamedBarriers usability improvement and list of [ReservedNamedBarriers](./include/cutlass/arch/barrier.h) has been officially released. +* Improved [CuTe documentation](./media/docs/cute/) including improved clarity and depth of [Quickstart](./media/docs/cute/00_quickstart.md), [CuTe Layout](./media/docs/cute/01_layout.md), and [CuTe Layout Algebra](./media/docs/cute/02_layout_algebra.md). Associated code comments, post-conditions, and details in [CuTe Core Unit Tests](./test/unit/cute/core/) also improved. + +## [3.3](https://github.com/NVIDIA/cutlass/releases/tag/v3.3.0) (2023-10-31) +* [Mixed-input Hopper GEMMs](./examples/55_hopper_mixed_dtype_gemm) support covering 16-bit x 8-bit input operand types. 
+* [Mixed-input Ampere GEMMs](https://github.com/NVIDIA/cutlass/pull/1084) with support for canonical layouts (TN). The implementation supports upcast on operandB {fp16, bf16} x {s8, u8}, and upcast on operandA {s8, u8} x {fp16, bf16}. +* [Copy Async based Hopper GEMMs](./test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_cooperative.cu) - which support lower than 16B aligned input tensors. +* Kernel schedules and Builder support for mixed precision and Copy Async GEMMs with < 16B aligned input tensors. +* Profiler support for lower-aligned Hopper GEMMs. +* Performance Improvements to [Scatter-Gather Hopper Example](./examples/52_hopper_gather_scatter_fusion). +* Sub-Byte type fixes and improvements. +* EVT Support for RELU with Aux bitmap tensor store (used in dRELU). See [SM90 EVT fusions](./include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp) for details. +* Fusion support for backprop fusions including drelu, dgelu, and dbias. +* Support for void-C kernels and SM80 mixed-input GEMMs in the CUTLASS Python interface + +## [3.2.2](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.2) (2023-10-25) +* Minor patch for issue/1138 + +## [3.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.1) (2023-09-22) +* Python support SM90 Epilogue Visitor Tree (EVT) on top of the C++ support released in 3.2.0. +* SM80 EVT support in C++ and Python. +* Other SM90 epilogue improvements. +* Splitting CUTLASS library into smaller units based on operation, arch and datatypes. See [1105](https://github.com/NVIDIA/cutlass/discussions/1105) for details. +* Making `tools/library/scripts` packageable - `tools/library/scripts` is now moving to `python/cutlass_library`. See the Python [README](./python/README.md) for details. +* SM90 TF32 kernel improvements for all layouts. +* SM90 rasterization direction support in the CUTLASS profiler. +* Improvement for CUTLASS profiler build times. +* Remove Python-C++ bindings. + +## [3.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.0) (2023-08-03) + +* New warp-specialized persistent FP8 GEMM kernel [kernel schedules](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) and [mainloops](./include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. An example showcasing [Hopper warp-specialized FP8 GEMMs](./examples/54_hopper_fp8_warp_specialized_gemm). FP8 GEMMs come with a fast accumulation mode. When enabled, problem execution might be faster but at the cost of lower accuracy because intermediate results will not periodically be promoted to a higher precision. +* New [Epilogue Visitor Tree (EVT)](./examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu) support for Hopper TMA epilogues. EVTs allows for user-defined customized epilogue fusion patterns without having to write a new epilogue. +* [Stream-K](./include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp) feature for Hopper. Note that this is only a functional implementation of stream-K, and should not be used for performance comparison. Optimizations are expected in a future release. +* Improved CTA rasterization and support for CTA swizzling for Hopper kernels using the [Tile Scheduler](./include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp). 
+* Improved performance for [warp-specialized TensorFloat-32 (TF32) GEMM kernels](test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu) targeting Hopper TMA. +* [Hopper GEMM+Permute](./examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu), an example of fusing tensor reordering (permutation) with GEMM mainloop or epilogue. +* New CUTLASS 2D Convolution Python interface. New [example](./examples/python/03_basic_conv2d.ipynb) here. +* Support for Windows (MSVC) builds. Tested with Visual Studio 2019 v16.11.27 on Windows 10.0. +* Optimal performance using [**CUDA 12.2u1**](https://developer.nvidia.com/cuda-downloads) +* Updates and bugfixes from the community (thanks!) + +## [3.1.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.1.0) (2023-04-14) +* New CUTLASS Python interface that aims to provide an ease-of-use interface for instantiating, emitting, compiling, and running CUTLASS kernels via Python. More details [here](./python/README.md) and new [examples](./examples/python). +* New [efficient epilogues](test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative.cu#L783) using TMA for Hopper. +* Support for [fused epilogues](test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_bias_elementwise.cu), such Bias, ReLU and GELU, using the new efficient epilogues. +* New [warp-specialized TensorFloat-32 (TF32) GEMM kernels](test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu) targeting Hopper TMA. +* New [*warp-specialized persistent cooperative*](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) kernel design that allows for larger tile sizes and improves performance on Hopper. +* An [example](./examples/51_hopper_gett) showcasing GEMM-Like Tensor-Tensor Contraction (GETT) capability on Hopper. +* Epilogue builders. Similar to mainloop builders (see [example 49](./examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu)), epilogue builders aim to generate the best-possible epilogue while exposing incremental opt-ins for greater customization. +* Profiler support for overriding kernel and epilogue builder auto schedules for 3.x API kernels, allowing specific policies to be run in the CUTLASS profiler. +* Performance optimizations for the [*warp-specialized persistent ping-pong*](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp) kernel. +* Changes to the [GEMM API 3.x](./media/docs/gemm_api_3x.md), involving the host-facing arguments and the underlying `Params` structs. +* [FMHA Backward Pass](./examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu) from Meta xFormers. +* [Streamk GEMM with Broadcast](./examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu) enables epilogue broadcast with StreamK GEMM. +* [Batched B2B GEMM](./examples/13_two_tensor_op_fusion) now can run multiple Back-to-Back GEMM with the same problem size in parallel. +* [Batched Strided GEMV](test/unit/gemm/device/gemv.cu) support both row major and column major input matrix. +* [Permute + GEMM fusion](./examples/39_gemm_permute) can fuse Permute with following GEMM now. Before, we only support fusing GEMM with Permute in the epilogue. +* [Row Broadcast](./include/cutlass/epilogue/threadblock/predicated_tile_iterator_row_broadcast.h) can be fused in the epilogue. +* The GitHub branch is renamed from `master` to `main` in this release. 
+* Optimal performance using [**CUDA 12.1**](https://developer.nvidia.com/cuda-downloads) +* Updates and bugfixes from the community (thanks!) + +## [3.0.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.0.0) (2023-01-23) +* [CuTe](./media/docs/cute/00_quickstart.md), a [new core library and backend](./include/cute) for CUTLASS 3.0 that defines a single Layout vocabulary type and an associated algebra of layouts for a much more expressive and composable abstraction for tensors, sets of parallel agents, and operations by said agents on tensors. +* [A new conceptual operation hierarchy](./media/docs/cutlass_3x_design.md) that replaces the architecture-centric hierarchy of CUTLASS 2.x and [documentation for CUTLASS 3.0's GEMM API changes](./media/docs/gemm_api_3x.md). +* Strict API backwards compatibility that exposes both 2.x and 3.x API kernels through the same [`device::GemmUniversalAdapter`](./include/cutlass/gemm/device/gemm_universal_adapter.h) and [`kernel::GemmUniversal`](./include/cutlass/gemm/kernel/gemm_universal.hpp) types, allowing users to include both APIs in the same translation units. More information can be found in the [3.x backwards compatibility section](./media/docs/cutlass_3x_backwards_compatibility.md). +* Updates to [Functionality](./media/docs/functionality.md) which directs users on which kernels are supported via CUTLASS-2 and CUTLASS-3. +* Updates to [Compatibility](./README.md#compatibility) Section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures and [Target Architecture](./README.md#Target-Architecture). +* New warp-specialized GEMM [kernel schedules](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp) and [mainloops](./include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. +* Extensions to CUTLASS profiler to support threadblock cluster shapes in library and profiler tile configurations. +* [CUTLASS library integration](./tools/library/src/gemm_operation_3x.hpp) for 3.x API kernels built through the new `CollectiveBuilder` API, enabling CUTLASS profiler. +* Support for [Hopper GEMMs](./examples/48_hopper_warp_specialized_gemm) through the new 3.0 API with CuTe-based exposure of the Hopper [Tensor Memory Accelerator](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor) and [WGMMA Tensor Core](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) features. +* Set of examples that demonstrate the usage of the new 3.0 API to easily build GEMM kernels targeting Hopper: examples [48](./examples/48_hopper_warp_specialized_gemm), [49](./examples/49_hopper_gemm_schedules_with_collective_builder), and [50](./examples/50_hopper_gemm_with_epilogue_swizzle). + +## [2.11.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.11.0) (2022-11-19) +* [Stream-K](./examples/47_ampere_gemm_universal_streamk), which is a new general way to do split-K. It can not only improve performance, but can also significantly reduce the number of tile sizes that need to be profiled to find the best one. +* [Fused multi-head attention Kernel](./examples/41_fused_multi_head_attention). It has two variants: one uses batched GEMM for the fixed sequence length, and the other one uses group GEMM for the variable sequence length. Both versions just need one kernel. 
+* [Dual GEMM](./examples/45_dual_gemm), which can fuse A x B and A x C into one kernel. The two GEMMs have no producer-consumer dependency.
+* Hopper improves [double precision matrix multiplication](./test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu) by 2x compared to Ampere at iso-clocks. It is supported since CUDA 11.8.
+* [BLAS3](./test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu) functions with Hopper's new double precision matrix multiplication instructions.
+* [ELL Block Sparse GEMM](./examples/43_ell_block_sparse_gemm), which uses an [ELL matrix](https://developer.nvidia.com/blog/accelerating-matrix-multiplication-with-block-sparse-format-and-nvidia-tensor-cores/) to describe the sparsity of the A matrix. B and output matrices are still dense. The block size can be arbitrary.
+* Optimized [Group Conv](./examples/42_ampere_tensorop_group_conv) for SingleGroup mode, which requires that the output channel per group is a multiple of Threadblock tile N.
+* [Optimized DepthWise Conv](./examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu). Two new modes are added:
+  * [kOptimized](./test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - uses direct convolution to compute instead of implicit GEMM.
+    * The restrictions are: 1) input channel, output channel, and group number should be multiples of (128 / sizeof(input element)). 2) The input filter size should be the same as the template parameter configuration.
+  * [kFixedStrideDilation](./test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_fixed_stride_dilation_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - puts stride and dilation into templates to further improve performance. In this mode, the kernel keeps some inputs persistent in registers to squeeze out more performance, so large filter/stride/dilation values are not recommended.
+    * The restrictions are: 1) input channel, output channel, and group number should be multiples of (128 / sizeof(input element)). 2) Input filter size, stride, and dilation should be the same as the template parameter configuration.
+* [Scripts](./examples/44_multi_gemm_ir_and_codegen) to fuse multiple back-to-back GEMMs. The implementation was discussed in a GTC'22 Spring [talk](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41606/).
+* [FP8 data type definition](./include/cutlass/float8.h) and [conversion routines](./include/cutlass/numeric_conversion.h#L1274-2115).
+* Updates and bugfixes from the community (thanks!). Big shout out to Meta's [xFormers](https://github.com/facebookresearch/xformers).
+
+* **Deprecation announcement:** CUTLASS plans to deprecate the following:
+  * Maxwell and Pascal GPU architectures
+  * Ubuntu 16.04
+  * CUDA 10.2
+
+## [2.10.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.10.0) (2022-08-23)
+* [CUTLASS Python](./examples/40_cutlass_py) now supports GEMM, CONV, and Group GEMM for different data types as well as different epilogue flavours.
+* Optimizations for CUTLASS's [Grouped GEMM](./examples/24_gemm_grouped/gemm_grouped.cu) kernel. The threadblock scheduling part is improved. Some computation can be moved to the host side if applicable. [Grouped Syr2k](./examples/38_syr2k_grouped/syr2k_grouped.cu) kernels are added, too.
+* Optimizations for [GEMM+Softmax](./examples/35_gemm_softmax). All the reduction computation is fused into the previous GEMM. More template arguments are provided to fine-tune the performance.
+* [Grouped GEMM for Multihead Attention](./examples/41_multi_head_attention).
This general group-GEMM-based MHA does not require the sequence lengths of all GEMMs to be the same, which makes it most useful for natural language processing.
+* [GEMM + Layer norm fusion for Ampere](./examples/37_gemm_layernorm_gemm_fusion/) splits the layernorm into two parts, and both of them can be fused into the preceding and following GEMMs separately. In addition to using the square sum to compute the variance of the layernorm, [Shift-K](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data) is provided if the square sum raises numerical issues.
+* [GEMM Epilogue Permutation Fusion](./examples/39_gemm_permute) can apply a user-provided permutation layout mapping in the GEMM epilogue.
+* [Grouped convolution targeting implicit GEMM](test/unit/conv/device/group_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) introduces the first group convolution implementation to CUTLASS. It is an Analytical implementation, not an Optimized one. The restrictions are: 1) input and output channel numbers should be multiples of the group number. 2) split-K is not supported. The implementation has 2 modes:
+  * kSingleGroup: output channel per group is a multiple of Threadblock tile N.
+  * kMultipleGroup: Threadblock tile N is a multiple of output channel per group.
+* [Depthwise separable convolution](test/unit/conv/device/depthwise_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) introduces the first depthwise convolution, which is also Analytical for now. The restrictions are: 1) SIMT only, 2) no split-K, and 3) input channel, output channel, and group number are all equal.
+* Standalone [Layernorm](./tools/util/include/cutlass/util/device_layernorm.h) and [Pooling](./tools/util/include/cutlass/util/device_nhwc_pooling.h) kernels.
+* [Back-to-back GEMM/CONV](./examples/13_two_tensor_op_fusion) relaxes the requirement that the first GEMM K dimension needs to be a multiple of the Threadblock Tile K dimension.
+* Optimal performance using [**CUDA 11.6u2**](https://developer.nvidia.com/cuda-downloads)
+* Updates and bugfixes from the community (thanks!)
## [2.9.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.9.0) (2022-04-21) -* [First layer Convolution kernels](/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) specialized for small channel counts and reduced alignment - * [Few channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities - * [Fixed channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size - * [Unit tests](/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) - * [Python-based instance emitter](/tools/library/scripts/generator.py) in the CUTLASS Library and support in the Profiler +* [First layer Convolution kernels](./test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) specialized for small channel counts and reduced alignment + * [Few channels](./include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities + * [Fixed channels](./include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size + * [Unit tests](./test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) + * [Python-based instance emitter](./python/cutlass_library/generator.py) in the CUTLASS Library and support in the Profiler * [BLAS3](https://docs.nvidia.com/cuda/cublas/index.html#cublas-level-3-function-reference) operators accelerated by Tensor Cores * Supported types: f32, cf32, f64, cf64, tf32x3, complex tf32x3 - * [HERK](/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](/tools/library/scripts/rank_k_operation.py) - * [SYRK](/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu) with [emitter](/tools/library/scripts/rank_k_operation.py) - * [SYMM](/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu) with [emitter](/tools/library/scripts/symm_operation.py) - * [TRMM](/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu) with [emitter](/tools/library/scripts/trmm_operation.py) - * [Unit tests](/test/unit/gemm/device/testbed_rank_k_universal.h) -* [CUTLASS Python](/examples/40_cutlass_py) demonstrating JIT compilation of CUTLASS kernels and a Python-based runtime using [CUDA Python](https://developer.nvidia.com/cuda-python) - * [Python-based runtime](/tools/library/scripts/rt.py) interoperable with existing emitters -* [GEMM + Softmax example](/examples/35_gemm_softmax) -* [Gather and Scatter Fusion with GEMM](/examples/36_gather_scatter_fusion) can gather inputs and scatters outputs based on indices vectors in the same GEMM kernel. 
+ * [HERK](./test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](./python/cutlass_library/rank_k_operation.py) + * [SYRK](./test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu) with [emitter](./python/cutlass_library/rank_k_operation.py) + * [SYMM](./test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu) with [emitter](./python/cutlass_library/symm_operation.py) + * [TRMM](./test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu) with [emitter](./python/cutlass_library/trmm_operation.py) + * [Unit tests](./test/unit/gemm/device/testbed_rank_k_universal.h) +* [CUTLASS Python](./examples/40_cutlass_py) demonstrating JIT compilation of CUTLASS kernels and a Python-based runtime using [CUDA Python](https://developer.nvidia.com/cuda-python) + * [Python-based runtime](./tools/library/scripts/rt.py) interoperable with existing emitters +* [GEMM + Softmax example](./examples/35_gemm_softmax) +* [Gather and Scatter Fusion with GEMM](./examples/36_gather_scatter_fusion) can gather inputs and scatters outputs based on indices vectors in the same GEMM kernel. * It can select random rows in a row major matrix. * It can select random columns in a column major matrix. -* [Back-to-back GEMM/CONV](examples/13_two_tensor_op_fusion) fully supports buffering the first GEMM/CONV results in the shared memory for the latter one to use. It can eliminate register spill when the tile size is big. Additionally, bias vector add is supported in the first GEMM/CONV. +* [Back-to-back GEMM/CONV](./examples/13_two_tensor_op_fusion) fully supports buffering the first GEMM/CONV results in the shared memory for the latter one to use. It can eliminate register spill when the tile size is big. Additionally, bias vector add is supported in the first GEMM/CONV. * Supported kernels: GEMM and CONV. * Supported types: fp16 and int8. * Supported architectures: Turing and Ampere. -* [Transposed Convolution](/examples/34_transposed_conv2d) (a.k.a Deconvolution) support which reuses Dgrad implementation. -* [Utility functions](/tools/util/include/cutlass/util) that can pad NHWC and convert between NCHW and NHWC. +* [Transposed Convolution](./examples/34_transposed_conv2d) (a.k.a Deconvolution) support which reuses Dgrad implementation. +* [Utility functions](./tools/util/include/cutlass/util) that can pad NHWC and convert between NCHW and NHWC. * [Small alignment implicit gemm](https://github.com/NVIDIA/cutlass/issues/242) support for Fprop/Dgrad/Wgrad so that padding is no longer mandated to use tensor cores in these kernels. * Epilogue enhancement: * Eliminate bank conflicts in int8 tensor core kernels. * Half2 usage if epilogue compute type is fp16. * More activation functions: Silu, Hardswish, Leaky Relu. - * New elementwise fusion pattern for [residual block](/include/cutlass/epilogue/thread/linear_combination_residual_block.h). -* [Group GEMM](/examples/24_gemm_grouped) thread block number calculation fix which helps to launch the intended number of threadblocks to fully occupy the GPUs. + * New elementwise fusion pattern for [residual block](./include/cutlass/epilogue/thread/linear_combination_residual_block.h). +* [Group GEMM](./examples/24_gemm_grouped) thread block number calculation fix which helps to launch the intended number of threadblocks to fully occupy the GPUs. * [Parallel GEMM splitk](https://github.com/NVIDIA/cutlass/pull/277) support in the CUTLASS profiler. 
-* Optimal performance using [**CUDA 11.7**](https://developer.nvidia.com/cuda-downloads)
+* Optimal performance using [**CUDA 11.6u2**](https://developer.nvidia.com/cuda-downloads)
 * Updates and bugfixes from the community (thanks!)
+
 ## [2.8.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.8.0) (2021-11-19)
 * **TF32x3:** emulated single-precision using Tensor Cores
   * 45+ TFLOPs on NVIDIA A100
-  * [GEMM SDK example](/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu) (real)
-  * [COMPLEX GEMM SDK example](/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm.cu) (complex)
-  * [Implicit GEMM Convolution SDK example](/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu)
+  * [GEMM SDK example](./examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu) (real)
+  * [COMPLEX GEMM SDK example](./examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu) (complex)
+  * [Implicit GEMM Convolution SDK example](./examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu)
 * **Mainloop fusion for Convolution:** convolution with fused per-channel scale-bias-relu
-  * [Conv Fprop SDK example](/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu)
-  * [Conv WGrad SDK example](/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu)
-  * [cutlass::conv::device::ImplicitGemmConvolutionFusion](/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h)
+  * [Conv Fprop SDK example](./examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu)
+  * [Conv WGrad SDK example](./examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu)
+  * [cutlass::conv::device::ImplicitGemmConvolutionFusion](./include/cutlass/conv/device/implicit_gemm_convolution_fusion.h)
 * **Grouped GEMM:** similar to batched GEMM with distinct problem size per group
-  * [SDK example](/examples/24_gemm_grouped) with performance comparison with Batched Strided GEMM
-  * [cutlass::gemm::device::GemmGrouped](/include/cutlass/gemm/device/gemm_grouped.h)
-* [Implicit GEMM Convolution fusion](/examples/13_two_tensor_op_fusion/) supports staging 1st convolution's output accumulator in the shared memory on Turing. This allows more flexible warp tile sizes and less regsiter pressue.
+  * [SDK example](./examples/24_gemm_grouped) with performance comparison with Batched Strided GEMM
+  * [cutlass::gemm::device::GemmGrouped](./include/cutlass/gemm/device/gemm_grouped.h)
+* [Implicit GEMM Convolution fusion](./examples/13_two_tensor_op_fusion/) supports staging the 1st convolution's output accumulator in shared memory on Turing. This allows more flexible warp tile sizes and less register pressure.
 * Optimal performance using [**CUDA 11.5**](https://developer.nvidia.com/cuda-downloads)
 * Updates from the community (thanks!)
@@ -61,11 +274,11 @@ * CUDA 10.2 ## [2.7.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.7.0) (2021-09-24) - * Mainloop fusion for GEMM: [summation over A or B](/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu) - * [Strided DGRAD (optimized iterators)](/include/cutlass/conv/kernel/default_conv2d_dgrad.h) - * [Half-precision GELU_taylor activation functions](/include/cutlass/epilogue/thread/activation.h#L196) + * Mainloop fusion for GEMM: [summation over A or B](./examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu) + * [Strided DGRAD (optimized iterators)](./include/cutlass/conv/kernel/default_conv2d_dgrad.h) + * [Half-precision GELU_taylor activation functions](./include/cutlass/epilogue/thread/activation.h#L196) * Use these when accumulation and epilogue compute types are all `cutlass::half_t` - * Tuning and bug fixes to [fused GEMM + GEMM example](/examples/13_two_tensor_op_fusion/) + * Tuning and bug fixes to [fused GEMM + GEMM example](./examples/13_two_tensor_op_fusion/) * Support for smaller than 128b aligned Convolutions: [see examples](test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu#L272) * Caching of results to accelerate Convolution [unit tests](test/unit/conv/device/cache_testbed_output.h) * Can be enabled or disabled by running `cmake .. -DCUTLASS_TEST_ENABLE_CACHED_RESULTS=OFF` @@ -80,27 +293,27 @@ ## [2.6.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.6.0) (2021-07-22) * Optimal performance when compiled with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit) - * Adopt the new L2 prefetch feature in [cp.async](/include/cutlass/arch/memory.h) and [global load](/include/cutlass/arch/memory_sm80.h) + * Adopt the new L2 prefetch feature in [cp.async](./include/cutlass/arch/memory.h) and [global load](./include/cutlass/arch/memory_sm80.h) * Fused operators with GEMM and Convolution * [Fused broadcast in epilogue](test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu) - * [Fused partial reduction in epilogue](/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu) + * [Fused partial reduction in epilogue](./test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu) * 64b tensor strides and leading dimensions support for GEMMs - * Affine rank=2 matrix layouts - * Row stride and column stride for matrices using [cutlass::layout::AffineRank2](/include/cutlass/layout/matrix.h) - * Support [FP64 tensor core](/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu) and SIMT GEMM. - * [Batched GEMV](/test/unit/gemm/device/gemv.cu) preview implementation + * Affine rank=2 matrix layouts + * Row stride and column stride for matrices using [cutlass::layout::AffineRank2](./include/cutlass/layout/matrix.h) + * Support [FP64 tensor core](./examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu) and SIMT GEMM. 
+ * [Batched GEMV](./test/unit/gemm/device/gemv.cu) preview implementation * [New strided Dgrad](test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) implementation * Accelerates over previous implementation by cutting down redundant math by 4x * Support using new `Dy` and `w` analytic iterators and existing `cutlass::conv::device::ImplicitGemmConvolution` interface * Quaternion-valued GEMM and Convolution in single- and double-precision (targeting CUDA Cores) - * Updates to [quaternion.h](/include/cutlass/quaternion.h) and [functional.h](/include/cutlass/functional.h) - * SDK Example for [GEMM](/examples/21_quaternion_gemm/quaternion_gemm.cu) and [Convolution](/examples/22_quaternion_gemm/quaternion_conv.cu) - * [Unit tests for GEMM](/test/unit/gemm/device/simt_qgemm_nn_sm50.cu) and [Convolution](/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu) + * Updates to [quaternion.h](./include/cutlass/quaternion.h) and [functional.h](./include/cutlass/functional.h) + * SDK Example for [GEMM](./examples/21_quaternion_gemm/quaternion_gemm.cu) and [Convolution](./examples/22_quaternion_conv/quaternion_conv.cu) + * [Unit tests for GEMM](./test/unit/gemm/device/simt_qgemm_nn_sm50.cu) and [Convolution](./test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu) * Many improvements to the epilogue. - * Provide an [option](/include/cutlass/epilogue/threadblock/epilogue.h) to not fully unroll the epilogue to reduce the code size and improve the performance when using complicated elementwise operations + * Provide an [option](./include/cutlass/epilogue/threadblock/epilogue.h) to not fully unroll the epilogue to reduce the code size and improve the performance when using complicated elementwise operations * Performance improvement for FP16 tensor core kernels * Bug fixes - * Enhanced Clang support and the combination of Clang 13 and CUDA 11.4 can build and run kernels from Pascal and Ampere. + * Enhanced Clang support and the combination of Clang 13 and CUDA 11.4 can build and run kernels from Pascal and Ampere. 
* Updated minimum CUDA Toolkit requirement to 10.2 * [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit) recommended * Corrections and bug fixes reported by the CUTLASS community @@ -109,17 +322,17 @@ ## [2.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.5.0) (2021-02-26) * Tensor reductions * _m_-to-_n_ reductions of tensors with affine layout - * [Specializations](/test/unit/reduction/device/tensor_reduce_contiguous.cu) for reductions including contiguous dimension - * [Specializations](/test/unit/reduction/device/tensor_reduce_strided.cu) for reductions excluding contiguous dimension + * [Specializations](./test/unit/reduction/device/tensor_reduce_contiguous.cu) for reductions including contiguous dimension + * [Specializations](./test/unit/reduction/device/tensor_reduce_strided.cu) for reductions excluding contiguous dimension * Custom reduction functors such as `cutlass::logical_and` * Large tensor support, up to 2^63 elements (however, each dimension is limited to an extent of 2^31) * Optimizations for 3-D convolution - * [Optimized tile iterators](include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h) using precomputed delta table for 3-D convolution + * [Optimized tile iterators](./include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h) using precomputed delta table for 3-D convolution * Full coverage of [forward](test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) and [backwards](test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) passes for 3D convolution - * [Fused Convolution+Convolution example](/examples/13_two_tensor_op_fusion/README.md) + * [Fused Convolution+Convolution example](./examples/13_two_tensor_op_fusion/README.md) * Corrections and bug fixes reported by the CUTLASS community * Thank you for filing these issues! 
- + ## [2.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.4.0) (2020-11-19) * Implicit GEMM convolution kernels supporting CUDA and Tensor Cores on NVIDIA GPUs @@ -127,11 +340,11 @@ * Data type: FP32, complex, Tensor Float 32 (TF32), BFloat16 (BF16), Float16, Int4, Int8, Int32 * Spatial dimensions: 1-D, 2-D, and 3-D * Layout: NHWC, NCxHWx - * Implicit GEMM convolution components: + * Implicit GEMM convolution components: * Global memory iterators supporting Fprop, Dgrad, and Wgrad * `MmaMultistage` for implicit GEMM convolution for NVIDIA Ampere architecture * `MmaPipeline` for implicit GEMM convolution for NVIDIA Volta and Turing architectures - * [Documentation](/media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation + * [Documentation](./media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation ## [2.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.3.0) (2020-09-23) * [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/) @@ -139,21 +352,21 @@ * Direct access to Sparse Tensor Cores and maximum performance via [`mma.sp.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma-and-friends) * Fast SGEMM targeting GeForce RTX 30-series CUDA Cores * Minor Features: - * [Activation functions](/include/cutlass/epilogue/thread/activation.h) such as [GeLU](/include/cutlass/epilogue/thread/linear_combination_gelu.h) and [Sigmoid](/include/cutlass/epilogue/thread/linear_combination_sigmoid.h) - * Small [matrix](/include/cutlass/matrix.h) and [quaternion](/include/cutlass/quaternion.h) template classes in device code - * [Floating-point constants](/include/cutlass/constants.h) + * [Activation functions](./include/cutlass/epilogue/thread/activation.h) such as [GeLU](./include/cutlass/epilogue/thread/linear_combination_gelu.h) and [Sigmoid](./include/cutlass/epilogue/thread/linear_combination_sigmoid.h) + * Small [matrix](./include/cutlass/matrix.h) and [quaternion](./include/cutlass/quaternion.h) template classes in device code + * [Floating-point constants](./include/cutlass/constants.h) * NVIDIA Ampere GPU Architecture examples and documentation: - * [Tensor Float 32](/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu) and - * [Sparse Tensor Cores](/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu) - * Documentation added on CUTLASS [efficient row-major epilogue](/media/docs/gemm_api.md#efficient-epilogue) + * [Tensor Float 32](./examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu) and + * [Sparse Tensor Cores](./examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu) + * Documentation added on CUTLASS [efficient row-major epilogue](./media/docs/gemm_api.md#efficient-epilogue) ## [2.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.2.0) (2020-06-08) * [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/) - * Fast Tensor Core operations: + * Fast Tensor Core operations: * Maximum performance via [`mma.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma-and-friends) * Tensor Float 32, BFloat16, and double-precision data types * Mixed integer data types (int8, int4, bin1) - * Asynchronous copy for deep software pipelines via [`cp.async`](https://docs.nvidia.com/cuda/parallel-thread-execution) + * Asynchronous copy 
for deep software pipelines via [`cp.async`](https://docs.nvidia.com/cuda/parallel-thread-execution) * Described in [GTC 2020 Webinar (SR 21745)](https://developer.nvidia.com/gtc/2020/video/s21745) (free registration required) * Features: * SDK examples showing GEMM fused with bias+relu and fused GEMM+GEMM @@ -165,11 +378,11 @@ * Disabled F16C by default for compatibility - enable on cmake command line with `-DCUTLASS_ENABLE_F16C=ON` ## [2.1.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.1.0) (2020-04-06) - * BLAS-style host-side API added to [CUTLASS Library](/media/docs/quickstart.md#cutlass-library) + * BLAS-style host-side API added to [CUTLASS Library](./media/docs/quickstart.md#cutlass-library) * API to launch compiled kernel instances for GEMM and planar complex GEMM * Planar Complex GEMM kernels targeting Volta and Turing Tensor Cores * Computes complex matrix products on matrices stored as disjoint real and imaginary parts - * [SDK Examples of Planar Complex GEMMs](/examples/10_planar_complex/planar_complex.cu) + * [SDK Examples of Planar Complex GEMMs](./examples/10_planar_complex/planar_complex.cu) * Minor enhancements and bug fixes ## [2.0.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.0.0) (2019-11-19) @@ -179,10 +392,10 @@ * Encapsulated functionality embodying modern C++11 programming techniques * Optimized containers and data types for efficient, generic, portable device code * Updates to: - * [Quick start guide](/media/docs/quickstart.md) - * [Documentation](/README.md#documentation) - * [Utilities](/media/docs/utilities.md) - * [CUTLASS Profiler](/media/docs/profiler.md) + * [Quick start guide](./media/docs/quickstart.md) + * [Documentation](./README.md#documentation) + * [Utilities](./media/docs/utilities.md) + * [CUTLASS Profiler](./media/docs/profiler.md) * Native Turing Tensor Cores * Efficient GEMM kernels targeting Turing Tensor Cores * Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands @@ -246,7 +459,7 @@ ## Copyright -Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause ``` diff --git a/CITATION.cff b/CITATION.cff index ea053e66fb..ea97f1f68e 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -5,33 +5,61 @@ message: >- following metadata. 
type: software authors: - - given-names: Andrew - email: akerr@nvidia.com - family-names: Kerr + - given-names: Vijay + family-names: Thakkar + email: vithakkar@nvidia.com + affiliation: NVIDIA + - given-names: Pradeep + family-names: Ramani + email: prramani@nvidia.com + affiliation: NVIDIA + - given-names: Cris + family-names: Cecka + email: ccecka@nvidia.com + affiliation: NVIDIA + - given-names: Aniket + family-names: Shivam + email: ashivam@nvidia.com + affiliation: NVIDIA + - given-names: Honghao + family-names: Lu + email: honghaol@nvidia.com + affiliation: NVIDIA + - given-names: Ethan + family-names: Yan + email: etyan@nvidia.com + affiliation: NVIDIA + - given-names: Jack + family-names: Kosaian + email: jkosaian@nvidia.com + affiliation: NVIDIA + - given-names: Mark + family-names: Hoemmen + email: mhoemmen@nvidia.com affiliation: NVIDIA - given-names: Haicheng family-names: Wu - affiliation: NVIDIA email: haichengw@nvidia.com - - given-names: Manish - family-names: Gupta - affiliation: Google - email: manigupta@google.com - - given-names: Dustyn - family-names: Blasig - email: dblasig@nvidia.com affiliation: NVIDIA - - given-names: Pradeep - family-names: Ramini - email: prramani@nvidia.com + - given-names: Andrew + family-names: Kerr + email: akerr@nvidia.com + affiliation: NVIDIA + - given-names: Matt + family-names: Nicely + email: mnicely@nvidia.com affiliation: NVIDIA - given-names: Duane family-names: Merrill email: dumerrill@nvidia.com affiliation: NVIDIA - - given-names: Aniket - family-names: Shivam - email: ashivam@nvidia.com + - given-names: Dustyn + family-names: Blasig + email: dblasig@nvidia.com + affiliation: NVIDIA + - given-names: Fengqi + family-names: Qiao + email: fqiao@nvidia.com affiliation: NVIDIA - given-names: Piotr family-names: Majcher @@ -49,10 +77,12 @@ authors: family-names: Wang email: jinw@nvidia.com affiliation: NVIDIA - - given-names: Matt - family-names: Nicely - email: mnicely@nvidia.com - affiliation: NVIDIA + - given-names: Manish + family-names: Gupta + affiliation: Google + email: manigupta@google.com + + repository-code: 'https://github.com/NVIDIA/cutlass' abstract: >- CUTLASS is a collection of CUDA C++ template @@ -71,12 +101,12 @@ abstract: >- flexibility simplifies their use as building blocks within custom kernels and applications. keywords: - - 'cutlass, tensor cores, cuda' + - 'cutlass, tensor cores, cuda, cute, nvidia, gpu, linear algebra, matrix computations' license: BSD-3-Clause -license-url: https://github.com/NVIDIA/cutlass/blob/v2.9.0/LICENSE.txt -version: '2.9' -date-released: '2022-04-27' +license-url: https://github.com/NVIDIA/cutlass/blob/v3.0.0/LICENSE.txt +version: '3.0.0' +date-released: '2023-01-23' identifiers: - type: url - value: "https://github.com/NVIDIA/cutlass/tree/v2.9.0" - description: The GitHub release URL of tag 2.9.0 + value: "https://github.com/NVIDIA/cutlass/tree/v3.0.0" + description: The GitHub release URL of tag 3.0.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index cfed600b72..e9c501bc2b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -26,7 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-cmake_minimum_required(VERSION 3.12.4 FATAL_ERROR) +cmake_minimum_required(VERSION 3.19 FATAL_ERROR) +cmake_policy(SET CMP0112 NEW) if(cutlass_LOADED) # If CUTLASS has been previously fetched and loaded, don't do it again. @@ -37,31 +38,69 @@ else() endif() message(STATUS "CMake Version: ${CMAKE_VERSION}") +set(IMPLICIT_CMAKE_CXX_STANDARD OFF CACHE BOOL "Do not explicitly specify -std=c++17 if set") + +# To reduce duplicate version locations, parse the version out of the +# main versions.h file and reuse it here. + +file(READ ${CMAKE_CURRENT_SOURCE_DIR}/include/cutlass/version.h VERSION_FILE_CONTENTS) +string(REGEX MATCH "#define CUTLASS_MAJOR ([0-9]+)" _CUTLASS_VERSION_MAJOR "${VERSION_FILE_CONTENTS}") +set(_CUTLASS_VERSION_MAJOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "#define CUTLASS_MINOR ([0-9]+)" _CUTLASS_VERSION_MINOR "${VERSION_FILE_CONTENTS}") +set(_CUTLASS_VERSION_MINOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "#define CUTLASS_PATCH ([0-9]+)" _CUTLASS_VERSION_PATCH "${VERSION_FILE_CONTENTS}") +set(_CUTLASS_VERSION_PATCH ${CMAKE_MATCH_1}) + +message(STATUS "CUTLASS ${_CUTLASS_VERSION_MAJOR}.${_CUTLASS_VERSION_MINOR}.${_CUTLASS_VERSION_PATCH}") + +## CUTLASS PROJECT ############################################################# + +project(CUTLASS VERSION ${_CUTLASS_VERSION_MAJOR}.${_CUTLASS_VERSION_MINOR}.${_CUTLASS_VERSION_PATCH} LANGUAGES CXX) + +################################################################################ + +if (CMAKE_CXX_COMPILER_ID MATCHES "GNU") + set(CUTLASS_GNU_HOST_COMPILE ON CACHE BOOL "Using GNU tools for host code compilation") +endif() +if (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang") + set(CUTLASS_CLANG_HOST_COMPILE ON CACHE BOOL "Using Clang tools for host code compilation") +endif() +if (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + set(CUTLASS_MSVC_HOST_COMPILE ON CACHE BOOL "Using MSVC tools for host code compilation") +endif() + +################################################################################ -project(CUTLASS VERSION 2.9.0 LANGUAGES CXX) include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake) -if (CUDA_VERSION VERSION_LESS 10.2) - message(WARNING "CUTLASS ${CUTLASS_VERSION} requires CUDA 10.2 or higher, and strongly recommends CUDA 11.0 or higher.") -elseif (CUDA_VERSION VERSION_LESS 11.0) - message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.0 or higher.") +if (CUDA_VERSION VERSION_LESS 11.3) + message(WARNING "CUTLASS ${CUTLASS_VERSION} requires CUDA 11.4 or higher, and strongly recommends CUDA 11.8 or higher.") +elseif (CUDA_VERSION VERSION_LESS 11.4) + message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.8 or higher.") +endif() + +if(CUTLASS_GNU_HOST_COMPILE AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.3) + message(FATAL_ERROR "GCC version must be at least 7.3!") endif() +if (CUTLASS_CLANG_DEVICE_COMPILE AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0) + message(FATAL_ERROR "Clang 7.0+ required for GPU compilation") +endif() find_package(Doxygen QUIET) +################################################################################ + # -# CUTLASS 2.x requires C++11 +# CUTLASS 3.x requires C++17 # -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -if(CUTLASS_NATIVE_CUDA) - set(CMAKE_CUDA_STANDARD 11) - set(CMAKE_CUDA_STANDARD_REQUIRED ON) -else() - list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++11) -endif() +set(CMAKE_CUDA_STANDARD 17) 
+set(CMAKE_CUDA_STANDARD_REQUIRED ON) + +list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) set(CMAKE_INSTALL_PREFIX install CACHE PATH "Default installation location." FORCE) @@ -69,16 +108,28 @@ endif() message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}") +set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.") +# 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive + +find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED) + +################################################################################ set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library") if(CUTLASS_ENABLE_HEADERS_ONLY) set(CUTLASS_ENABLE_EXAMPLES_INIT OFF) set(CUTLASS_ENABLE_TOOLS_INIT ON) set(CUTLASS_ENABLE_LIBRARY_INIT OFF) + set(CUTLASS_ENABLE_TESTS_INIT OFF) else() set(CUTLASS_ENABLE_EXAMPLES_INIT ON) set(CUTLASS_ENABLE_TOOLS_INIT ON) set(CUTLASS_ENABLE_LIBRARY_INIT ON) + if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) + set(CUTLASS_ENABLE_TESTS_INIT ON) + else() + set(CUTLASS_ENABLE_TESTS_INIT OFF) + endif() endif() set(CUTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.") @@ -87,44 +138,46 @@ set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable C set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools") set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_LIBRARY_INIT} CACHE BOOL "Enable CUTLASS Library") set(CUTLASS_ENABLE_PROFILER ${CUTLASS_ENABLE_LIBRARY} CACHE BOOL "Enable CUTLASS Profiler") - -if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) - set(CUTLASS_ENABLE_TESTS_INIT ${CUTLASS_ENABLE_LIBRARY}}) -else() - set(CUTLASS_ENABLE_TESTS_INIT OFF) -endif() +set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Performance") set(CUTLASS_ENABLE_TESTS ${CUTLASS_ENABLE_TESTS_INIT} CACHE BOOL "Enable CUTLASS Tests") - -if (CUTLASS_ENABLE_TESTS) - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake) +set(CUTLASS_ENABLE_GTEST_UNIT_TESTS ${CUTLASS_ENABLE_TESTS} CACHE BOOL "Enable CUTLASS GTest-based Unit Tests") +set(CUTLASS_USE_SYSTEM_GOOGLETEST OFF CACHE BOOL "Use system/external installation of GTest") +set(CUTLASS_USE_PACKED_TUPLE ON CACHE BOOL "If ON, make cute::tuple be new standard-layout tuple type; if OFF, use the original cute::tuple implementation that is _not_ standard-layout.") +if (CUTLASS_USE_PACKED_TUPLE) + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTE_USE_PACKED_TUPLE=1) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUTLASS_USE_PACKED_TUPLE=1") + message(STATUS "Make cute::tuple be the new standard-layout tuple type") +elseif() + message(STATUS "Use the original cute::tuple implementation that is _not_ standard-layout") endif() +################################################################################ + set(CUTLASS_NVCC_ARCHS_SUPPORTED "") -if (NOT CUDA_VERSION VERSION_LESS 7.5) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 53) -endif() -if (NOT CUDA_VERSION VERSION_LESS 8.0) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 60 61) -endif() -if (NOT CUDA_VERSION VERSION_LESS 9.0) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70) +if (CUDA_VERSION VERSION_GREATER_EQUAL 11.4) + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70 72 75 80 86 87) endif() -if (NOT CUDA_VERSION VERSION_LESS 9.2) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 72) +if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 89 90) endif() -if (NOT CUDA_VERSION VERSION_LESS 10.0) - 
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 75) -endif() -if (NOT CUDA_VERSION VERSION_LESS 11.0) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 80) -endif() -if (NOT CUDA_VERSION VERSION_LESS 11.1 AND NOT CUDA_COMPILER MATCHES "[Cc]lang") - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 86) +if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0) + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 90a) endif() set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.") set(CUTLASS_NVCC_ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS} CACHE STRING "The SM architectures to build code for.") +# Find unsupported and deprecated compute capabilities +if (CUTLASS_NVCC_ARCHS_SUPPORTED) + set(CUTLASS_NVCC_ARCHS_UNSUPPORTED ${CUTLASS_NVCC_ARCHS}) + list(REMOVE_ITEM CUTLASS_NVCC_ARCHS_UNSUPPORTED ${CUTLASS_NVCC_ARCHS_SUPPORTED}) + if (CUTLASS_NVCC_ARCHS_UNSUPPORTED) + message(WARNING "Using unsupported or deprecated compute capabilities ${CUTLASS_NVCC_ARCHS_UNSUPPORTED}. Support may be removed in future versions.") + endif() +else() + message(WARNING "No supported compute capabilities for CUDA ${CUDA_VERSION}.") +endif() + # Special policy introduced in CMake 3.13 if (POLICY CMP0076) cmake_policy(SET CMP0076 NEW) @@ -133,6 +186,7 @@ endif() include(GNUInstallDirs) link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs) +link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64) ################################################################################################### # @@ -161,9 +215,12 @@ if(WIN32) set(gtest_force_shared_crt ON CACHE BOOL "Use shared (DLL) run-time lib even when Google Test is built as static lib" FORCE) endif() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUTLASS_VERSIONS_GENERATED") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DCUTLASS_VERSIONS_GENERATED") + if (WIN32) - # Enable more warnings and treat as errors - list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=/W3 -Xcompiler=/WX) + # Enable more warnings. Add "-Xcompiler=/WX" to enable warnings as errors. + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=/W3) # Disable warning on Unicode characters list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=/wd4819) @@ -177,7 +234,7 @@ if (${CUTLASS_NVCC_VERBOSE}) endif() # -# CUTLASS NAMESPACE +# CUTLASS NAMESPACE # set(CUTLASS_NAMESPACE "cutlass" CACHE STRING "Top level namespace of CUTLASS") @@ -186,15 +243,44 @@ set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.") set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.") set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.") +################################################################################ # # CUTLASS generator cmake configuration # -set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of operation name filters. Default '' means all operations are enabled.") -set(CUTLASS_LIBRARY_KERNELS "" CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. 
If 'all' is specified, all kernels are enabled.") -set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.") -# Test Levels L0, L1, L2 -set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.") +# Kernel unified filter file + +set(KERNEL_FILTER_FILE "" CACHE STRING "KERNEL FILTER FILE FULL PATH") + +if (KERNEL_FILTER_FILE AND NOT CUTLASS_LIBRARY_KERNELS) + # If a kernel filter file is specified, we want to generate and then + # filter on the entire kernel set, not the default kernel + # (sub)set. The user may have overridden CUTLASS_LIBRARY_KERNELS, in which + # case the resulting kernel set will be the intersection of the two + # options differenced against CUTLASS_LIBRARY_IGNORE_KERNELS. + set(CUTLASS_LIBRARY_KERNELS_INIT "*") +else() + set(CUTLASS_LIBRARY_KERNELS_INIT "") +endif() + +if (KERNEL_FILTER_FILE) + get_filename_component(KERNEL_FILTER_FILE "${KERNEL_FILTER_FILE}" ABSOLUTE) + set(KERNEL_FILTER_FILE "${KERNEL_FILTER_FILE}" CACHE STRING "KERNEL FILTER FILE FULL PATH" FORCE) +endif() + +set(SELECTED_KERNEL_LIST "selected" CACHE STRING "Name of the filtered kernel list") + +if(KERNEL_FILTER_FILE) + message(STATUS "Full path of filter file: ${KERNEL_FILTER_FILE}") +endif() + +set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma-delimited list of operation name filters. Default '' means all operations are enabled.") +set(CUTLASS_LIBRARY_KERNELS ${CUTLASS_LIBRARY_KERNELS_INIT} CACHE STRING "Comma-delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If the string 'all' is specified, all kernels are enabled.") +set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma-delimited list of kernels to exclude from build. This option ONLY takes effect if CUTLASS_LIBRARY_KERNELS is set.") +set(CUTLASS_LIBRARY_EXCLUDE_KERNELS "" CACHE STRING "Comma-delimited list of kernels to exclude from build. This option always takes effect, whether or not CUTLASS_LIBRARY_KERNELS is set. It also can exclude kernels from the filter file (see KERNEL_FILTER_FILE).") +set(CUTLASS_LIBRARY_INSTANTIATION_LEVEL "" CACHE STRING "Instantiation level for SM90 kernels. Set to `max` and make sure CUTLASS_LIBRARY_KERNELS is non-empty to stamp all possible kernel configurations.") + +################################################################################ set(CUTLASS_TEST_ENABLE_CACHED_RESULTS ON CACHE BOOL "Enable caching and reuse of test results in unit tests") @@ -214,6 +300,8 @@ if (CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED) list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED=1) endif() +################################################################################ + # # CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations. 
# @@ -231,6 +319,15 @@ list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_DEBUG_TRACE_LEVEL=${CUTLASS_DEBUG_ set(CUTLASS_ENABLE_TENSOR_CORE_MMA ${CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT} CACHE BOOL "Enable PTX mma instruction for collective matrix multiply operations.") +set(CUTLASS_ENABLE_SM90_EXTENDED_MMA_SHAPES OFF CACHE BOOL + "Enable an extended set of SM90 WGMMA instruction shapes (may lead to increased compilation times)") +if(CUTLASS_ENABLE_SM90_EXTENDED_MMA_SHAPES) + message(STATUS "Enabled extended SM90 WGMMA instruction shapes") + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) +endif() + +set(CUTLASS_SKIP_REDUCTION_INIT OFF CACHE BOOL "Disable init reduction workspace") + # # NOTE: running with asan and CUDA requires the following environment variable: # @@ -258,10 +355,53 @@ if(CUTLASS_NVCC_EMBED_PTX) list(APPEND CUTLASS_CUDA_CLANG_FLAGS --cuda-include-ptx=all) endif() +if (CUTLASS_SKIP_REDUCTION_INIT) + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_SKIP_REDUCTION_INIT=1) +endif() + if (CUTLASS_ENABLE_TENSOR_CORE_MMA) list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1) endif() +set(CUTLASS_PROFILER_DISABLE_REFERENCE OFF CACHE BOOL "Disable compilation of reference kernels in the CUTLASS profiler.") +if (CUTLASS_PROFILER_DISABLE_REFERENCE) + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_PROFILER_DISABLE_REFERENCE=1) +endif() + +if (CUTLASS_ENABLE_GDC_FOR_SM90) + message(STATUS "Grid Dependency Control (GDC) is enabled for SM90 kernels (required for programmatic dependent launches).") + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1) +endif() + +set(CUTLASS_ENABLE_SYNCLOG OFF CACHE BOOL "Enable synchronization event logging for race condition debugging. WARNING: This redefines __syncthreads() and __syncwarp() in all downstream code!") + +if (CUTLASS_ENABLE_SYNCLOG) + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + string(APPEND CMAKE_CXX_FLAGS " -DCUTLASS_ENABLE_SYNCLOG=1") + string(APPEND CMAKE_CUDA_FLAGS " -DCUTLASS_ENABLE_SYNCLOG=1") +endif() + + + +# Warnings-as-error exceptions and warning suppressions for Clang builds +if (CUTLASS_CLANG_HOST_COMPILE) + + set(FLAGS_TO_ADD + "-Wno-error=implicit-int-conversion" + "-Wno-error=pass-failed" + "-Wno-error=inconsistent-missing-override" + "-Wno-sign-conversion" + "-Wno-unused-parameter" + ) + + foreach(FLAG ${FLAGS_TO_ADD}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}") + list(APPEND CUTLASS_CUDA_NVCC_FLAGS "${FLAG}") + list(APPEND CUTLASS_CUDA_CLANG_FLAGS "${FLAG}") + endforeach() + +endif() + if (NOT MSVC AND CUTLASS_NVCC_KEEP) # MSVC flow handles caching already, but for other generators we handle it here. 
set(CUTLASS_NVCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store NVCC scratch files") @@ -272,15 +412,26 @@ endif() if (CUTLASS_ENABLE_F16C AND NOT CMAKE_CROSSCOMPILING) list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_F16C=1) - if ((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + if (CUTLASS_GNU_HOST_COMPILE OR CUTLASS_CLANG_HOST_COMPILE) list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=-mf16c) - elseif((CMAKE_CXX_COMPILER_ID MATCHES "MSVC")) + elseif(CUTLASS_MSVC_HOST_COMPILE) list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=/arch:AVX2) endif() endif() -list(APPEND CUTLASS_CUDA_NVCC_FLAGS $<$:-Xcompiler=-Wconversion>) -list(APPEND CUTLASS_CUDA_NVCC_FLAGS $<$:-Xcompiler=-fno-strict-aliasing>) +if (CUTLASS_ENABLE_OPENMP_TESTS) + find_package(OpenMP) + if(OpenMP_CXX_FOUND) + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS}) + else() + message(WARNING "CUTLASS_ENABLE_OPENMP_TESTS set but OpenMP not found.") + endif() +endif() + +if(UNIX) + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=-Wconversion) + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=-fno-strict-aliasing) +endif() # Don't leak lineinfo in release builds if (NOT CMAKE_BUILD_TYPE MATCHES "Release") @@ -288,28 +439,13 @@ if (NOT CMAKE_BUILD_TYPE MATCHES "Release") list(APPEND CUTLASS_CUDA_NVCC_FLAGS -lineinfo) endif() -#Report CUDA build flags -if (CUDA_COMPILER MATCHES "[Cc]lang") - if(CUTLASS_CUDA_CLANG_FLAGS) - message(STATUS "Using CLANG flags: ${CUTLASS_CUDA_CLANG_FLAGS}") - endif() -else() - if(CUTLASS_CUDA_NVCC_FLAGS) - message(STATUS "Using NVCC flags: ${CUTLASS_CUDA_NVCC_FLAGS}") - endif() -endif() - -if(CUDA_COMPILER MATCHES "[Cc]lang") - if( NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) +if (CUTLASS_CLANG_DEVICE_COMPILE) + if (NOT CUTLASS_CLANG_HOST_COMPILE) message(FATAL_ERROR "Clang CUDA compilation requires Clang CXX compilation. Currently CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER_ID}" ) endif() - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0) - message(FATAL_ERROR "Clang 7.0+ required for GPU compilation") - endif() - - # There are numerous Clang versions that can work with each CUDA toolkit and the - # the checks are not very useful so we are turning them off and using testing to + # There are numerous Clang versions that can work with each CUDA toolkit and the + # the checks are not very useful so we are turning them off and using testing to # ensure the various combinations work properly. list(APPEND CUTLASS_CUDA_CLANG_FLAGS --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) @@ -320,71 +456,102 @@ if(CUDA_COMPILER MATCHES "[Cc]lang") list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -unroll-threshold=5000) list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wno-unused-command-line-argument) - string(REPLACE "." ";" CUDA_VERSION_PARTS ${CMAKE_CUDA_COMPILER_VERSION}) - list(GET CUDA_VERSION_PARTS 0 CUDA_VERSION_MAJOR) - list(GET CUDA_VERSION_PARTS 1 CUDA_VERSION_MINOR) list(APPEND CUTLASS_CUDA_CLANG_FLAGS -D__CUDACC_VER_MAJOR__=${CUDA_VERSION_MAJOR} -D__CUDACC_VER_MINOR__=${CUDA_VERSION_MINOR}) - # needed for libcublasLt.so in case it's installed in the same location as libcudart.so # dynamic linker can find it if linker sets RPATH (forced by --disable-new-tags) # Otherwise linker uses RUNPATH and that does not propagate to loaded libs. 
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wl,--disable-new-dtags) link_libraries(nvidia::cudart) + link_libraries(nvidia::cuda_driver) + +endif() + +#Report CUDA build flags +if (CUTLASS_CLANG_DEVICE_COMPILE AND CUTLASS_CUDA_CLANG_FLAGS) + set(__FLAG_GROUP Clang) + set(__FLAG_LIST CUTLASS_CUDA_CLANG_FLAGS) +else(CUTLASS_NVCC_DEVICE_COMPILE AND CUTLASS_CUDA_NVCC_FLAGS) + set(__FLAG_GROUP NVCC) + set(__FLAG_LIST CUTLASS_CUDA_NVCC_FLAGS) +endif() + +set(__FLAG_DISPLAY_STRING "") +set(__FLAG_DISPLAY_SEPARATOR) +list(JOIN ${__FLAG_LIST} "\n " __FLAG_DISPLAY_STRING) +message(STATUS "Using the following ${__FLAG_GROUP} flags: \n ${__FLAG_DISPLAY_STRING}") + +# Known gcc 8.1-8.3 SFINAE issue (fixed in gcc 8.4), check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87748 +# Also see https://github.com/NVIDIA/nccl/issues/835 for nvtx3.hpp +if (CUTLASS_GNU_HOST_COMPILE AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1 AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 8.3) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNVTX3_USE_CHECKED_OVERLOADS_FOR_GET=0") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DNVTX3_USE_CHECKED_OVERLOADS_FOR_GET=0") endif() -# Support for 128-bit integers if using NVIDIA C++ compiler +# Support for 128-bit integers if using NVIDIA C++ compiler if (${CMAKE_CXX_COMPILER_ID} MATCHES "PGI" OR ${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Mint128 ") endif() -if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - # CMake 3.18 added support for CUDA_ARCHITECTURES target property. We will use this - # property for CMake 3.18+, so we request the NEW behavior for correct compatibility. - # https://cmake.org/cmake/help/v3.18/policy/CMP0104.html#policy:CMP0104 - cmake_policy(SET CMP0104 NEW) +# CMake 3.18 added support for CUDA_ARCHITECTURES target property. We will use this +# property for CMake 3.18+, so we request the NEW behavior for correct compatibility. +# https://cmake.org/cmake/help/v3.18/policy/CMP0104.html#policy:CMP0104 +cmake_policy(SET CMP0104 NEW) + +if (MSVC) + + # MSVC by default does not apply the correct __cplusplus version as specified by the C++ standard + # because MSVC is not a completely compliant implementation. This option forces MSVC to use the + # appropriate value given the requested --std option. This fixes a compilation issue mismatch + # between GCC/Clang and MSVC. + # + # error : a constexpr function cannot have a nonliteral return type "dim3" + # + # See https://developercommunity.visualstudio.com/t/msvc-incorrectly-defines-cplusplus/139261 + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /Zc:__cplusplus") + +endif() + +# Some tests require this build option in order to link. 
+if (MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj") endif() function(cutlass_apply_cuda_gencode_flags TARGET) + set(options) + set(oneValueArgs) + set(multiValueArgs SM_ARCHS) + cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if (__SM_ARCHS) + set(ARCHS_ENABLED ${__SM_ARCHS}) + else() + set(ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS_ENABLED}) + endif() - set(NVCC_FLAGS) - set(CLANG_FLAGS) set(__CMAKE_CUDA_ARCHS) - foreach(ARCH ${CUTLASS_NVCC_ARCHS_ENABLED}) - list(APPEND CLANG_FLAGS --cuda-gpu-arch=sm_${ARCH}) + foreach(ARCH ${ARCHS_ENABLED}) set(CODES) if(CUTLASS_NVCC_EMBED_CUBIN) - list(APPEND CODES sm_${ARCH}) list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-real) endif() - if(CUTLASS_NVCC_EMBED_PTX) - list(APPEND CODES compute_${ARCH}) + if(CUTLASS_NVCC_EMBED_PTX AND NOT CUTLASS_CLANG_DEVICE_COMPILE) + # If we're using clang for device compilation, the ptx is inserted + # via another command line option and the `-virtual` flags will cause an error. list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-virtual) endif() list(JOIN CODES "," CODES_STR) - list(APPEND NVCC_FLAGS -gencode=arch=compute_${ARCH},code=[${CODES_STR}]) endforeach() - if (CUDA_COMPILER MATCHES "[Cc]lang") - target_compile_options( - ${TARGET} - PRIVATE - $<$:${CLANG_FLAGS}> - ) - elseif(CMAKE_VERSION GREATER_EQUAL 3.18) - set_property(TARGET ${TARGET} PROPERTY CUDA_ARCHITECTURES ${__CMAKE_CUDA_ARCHS}) - else() - target_compile_options( - ${TARGET} - PRIVATE - $<$:${NVCC_FLAGS}> - ) - endif() + set_property(TARGET ${TARGET} PROPERTY CUDA_ARCHITECTURES ${__CMAKE_CUDA_ARCHS}) endfunction() -# Cache the flags so they are available when the function below is called anywhere globally. +# Cache the flags so they are available when the function below is called anywhere globally. set(__CUTLASS_CUDA_FLAGS ${CUTLASS_CUDA_FLAGS} CACHE INTERNAL "") set(__CUTLASS_CUDA_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} CACHE INTERNAL "") @@ -401,8 +568,8 @@ set(__CUTLASS_CUDA_NVCC_FLAGS_DEBUG ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG} CACHE INTER function(cutlass_apply_standard_compile_options TARGET) - if(CUDA_COMPILER MATCHES "[Cc]lang") - set(CUDA_COMPILE_LANGUAGE CXX) + if(CUTLASS_CLANG_DEVICE_COMPILE) + set(CUDA_COMPILE_LANGUAGE CUDA) set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_CLANG_FLAGS}) set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_CLANG_FLAGS_RELEASE}) set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO}) @@ -434,7 +601,8 @@ endfunction() # GLOB for CUTLASS header files. Should we use a static list instead? 
file(GLOB_RECURSE CUTLASS_INCLUDE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} include/cutlass/*.h) -file(GLOB_RECURSE CUTLASS_CUTLASS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/include include/cutlass/*.h) +file(GLOB_RECURSE CUTLASS_CUTLASS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/include include/cutlass/*.h include/cutlass/*.hpp include/cutlass/*.inl) +file(GLOB_RECURSE CUTLASS_CUTE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/include include/cute/*.h*) file(GLOB_RECURSE CUTLASS_NVRTC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/test test/unit/nvrtc/kernel/*.h) ################################################################################################### @@ -459,7 +627,10 @@ set(CUTLASS_TOOLS_UTIL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/util/includ include_directories(${CUTLASS_INCLUDE_DIR}) target_compile_features(CUTLASS INTERFACE cxx_std_11) -target_compile_definitions(CUTLASS INTERFACE CUTLASS_NAMESPACE=${CUTLASS_NAMESPACE}) + +if (NOT CUTLASS_NAMESPACE STREQUAL "cutlass") + target_compile_definitions(CUTLASS INTERFACE CUTLASS_NAMESPACE=${CUTLASS_NAMESPACE}) +endif() if (NOT DEFINED CUTLASS_REVISION) @@ -481,8 +652,8 @@ if (NOT DEFINED CUTLASS_REVISION) endif() configure_file( - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/version.h.in - ${CMAKE_CURRENT_BINARY_DIR}/include/cutlass/version.h + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/version_extended.h.in + ${CMAKE_CURRENT_BINARY_DIR}/include/cutlass/version_extended.h @ONLY) target_include_directories( @@ -491,6 +662,12 @@ target_include_directories( $ $ $ + ) + +# Mark CTK headers as system to supress warnings from them +target_include_directories( + CUTLASS + SYSTEM INTERFACE $ ) @@ -543,6 +720,7 @@ if(NOT WIN32) "-Wl,-rpath,'$ORIGIN/../lib'" "-Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/lib64'" "-Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/lib'" + ${CMAKE_DL_LIBS} ) endif() @@ -550,6 +728,15 @@ endif() include(CTest) enable_testing() + +if (CUTLASS_ENABLE_GTEST_UNIT_TESTS) + if (CUTLASS_USE_SYSTEM_GOOGLETEST) + find_package(GTest REQUIRED) + else() + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake) + endif() +endif() + if (NOT TARGET test_all) add_custom_target(test_all) endif() @@ -569,6 +756,9 @@ install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest) ################################################################################ +set(CUTLASS_ENABLE_CUBLAS OFF CACHE BOOL "cuBLAS usage for tests") +set(CUTLASS_ENABLE_CUDNN OFF CACHE BOOL "cuDNN usage for tests") + include(${CMAKE_CURRENT_SOURCE_DIR}/cuBLAS.cmake) if (CUTLASS_ENABLE_CUBLAS) @@ -583,35 +773,54 @@ endif() ################################################################################ -set(CUTLASS_CTEST_TEMPLATE_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/CTestTestfile.config.cmake) +set(CUTLASS_DEFAULT_ACTIVE_TEST_SETS "default" CACHE STRING "Default + activated test sets. In `make test` mode, this string determines the + active set of tests. In `ctest` mode, this value can be overriden + with CUTLASS_TEST_SETS environment variable when running the ctest + executable.") + +file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}") +set(CUTLASS_CTEST_TEMPLATE_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/CTestTestfile.configure.cmake) set(CUTLASS_CTEST_GENERATED_FILES "" CACHE INTERNAL "") function(cutlass_add_executable_tests NAME TARGET) -# -# Generates test rules for `make test`, `make test_all`, and `ctest` invoked from either the +# +# Generates test rules for `make test`, `make test_all`, and `ctest` invoked from either the # or the / after installation. -# +# # NAME: The base name for the test. 
Can be run with `make ` or `ctest -R 'c'`. # TARGET: The target corresponding to the executable under test. # DISABLE_EXECUTABLE_INSTALL_RULE: An option, if given, that disables creating an install rule for TARGET. # DEPENDS: A list of targets or files on which this test is dependent. # DEPENDEES: A list of targets which should depend on this test. # TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments -# to pass to the test executable. A unique test with suffix _0, _1, ... is generated for each set of +# to pass to the test executable. A unique test is generated for each set of # options given. If this option is not used, a single test with no arguments is generated. +# TEST_COMMAND_OPTIONS_PREFIX: If provided, is added as a prefix to each TEST_COMMAND_OPTIONS value for +# generating the full variable name to be referenced. # RESULT_CACHE_FILE: A file to be installed alongside the test executable with pre-computed # test results to speed up test runtime. -# +# TEST_SETS_SUPPORTED: A list of test set names these tests support. +# - set(options DISABLE_EXECUTABLE_INSTALL_RULE) - set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE) - set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS) + set(options DISABLE_EXECUTABLE_INSTALL_RULE DO_NOT_LOWERCASE_TEST_NAME) + set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE TEST_COMMAND_OPTIONS_PREFIX) + set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS TEST_SETS_SUPPORTED) cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if (NOT DEFINED __DISABLE_TESTS) set(__DISABLE_TESTS OFF) endif() + set(TEST_EXE $) + set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR}) + + if (NOT DEFINED __TEST_SETS_SUPPORTED) + set(__TEST_SETS_SUPPORTED ${CUTLASS_DEFAULT_ACTIVE_TEST_SETS}) + endif() + + set(TEST_SETS_SUPPORTED ${__TEST_SETS_SUPPORTED}) + if (__RESULT_CACHE_FILE) add_custom_command( @@ -624,9 +833,9 @@ function(cutlass_add_executable_tests NAME TARGET) endif() if (NOT __DISABLE_EXECUTABLE_INSTALL_RULE AND CUTLASS_INSTALL_TESTS) - + # file(RELATIVE_PATH CMAKE_CURRENT_BINARY_RELATIVE_DIR ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) - + install( TARGETS ${TARGET} RUNTIME DESTINATION ${CUTLASS_TEST_INSTALL_BINDIR} @@ -640,7 +849,7 @@ function(cutlass_add_executable_tests NAME TARGET) ) endif() - + endif() if (NOT __TEST_COMMAND_OPTIONS) @@ -648,7 +857,6 @@ function(cutlass_add_executable_tests NAME TARGET) endif() list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT) - set(CMD_IDX 0) if (CMD_COUNT GREATER 1) add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS}) @@ -657,74 +865,96 @@ function(cutlass_add_executable_tests NAME TARGET) endforeach() endif() - foreach(CMD_OPTIONS ${__TEST_COMMAND_OPTIONS}) + if (CUTLASS_INSTALL_TESTS) + + set(_INLINE_PER_TEST_CODE) + + file(READ "${PROJECT_SOURCE_DIR}/cmake/CTestTestfile.test.configure.cmake" _INLINE_PER_TEST_CODE_TEMPLATE) + + endif() + + set(TEST_GROUP_NAME ${NAME}) + + # To run the tests from an install package with tests enabled, we need to generate test files + # that don't rely on the current directory structure in build. 
+ + set(TEST_NAME c${NAME}) + set(TEST_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/ctest/${TEST_NAME}) + file(MAKE_DIRECTORY ${TEST_GEN_DIR}) + + set(TEST_EXE_PATH $) + set(TEST_USE_EXTENDED_FORMAT ON) + configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake" @ONLY) + + set(TEST_EXE_PATH $) + set(TEST_USE_EXTENDED_FORMAT OFF) # ctest does not support extended add_test format. + configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake.in" @ONLY) + + foreach(CMD_OPTIONS_VAR IN LISTS __TEST_COMMAND_OPTIONS) if (CMD_COUNT GREATER 1) - set(TEST_NAME ${NAME}_${CMD_IDX}) + set(TESTCASE_NAME "${NAME}_${CMD_OPTIONS_VAR}") else() - set(TEST_NAME ${NAME}) + set(TESTCASE_NAME "${NAME}") endif() - # The following rigmarole is needed to deal with spaces and possible quotes in + if (NOT __DO_NOT_LOWERCASE_TEST_NAME) + string(TOLOWER "${TESTCASE_NAME}" TESTCASE_NAME) + endif() + + # The following rigmarole is needed to deal with spaces and possible quotes in # command line arguments. The options are passed "by reference" as the actual # variable names holding the real options. We then expand these in a way that - # preserves any quotes. Note, they have to be in this order for it to work for + # preserves any quotes. Note, they have to be in this order for it to work for # all the use cases below. - set(CMD_OPTIONS ${${CMD_OPTIONS}}) - list(JOIN CMD_OPTIONS " " TEST_COMMAND_OPTIONS) - separate_arguments(CMD_OPTIONS) - + set(TEST_COMMAND_OPTIONS ${${__TEST_COMMAND_OPTIONS_PREFIX}${CMD_OPTIONS_VAR}}) + list(JOIN TEST_COMMAND_OPTIONS " " TEST_COMMAND_OPTIONS) + separate_arguments(TEST_COMMAND_OPTIONS) + add_custom_target( - ${TEST_NAME} + ${TESTCASE_NAME} COMMAND - ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${CMD_OPTIONS} + ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${TEST_COMMAND_OPTIONS} DEPENDS ${TARGET} ) if (CMD_COUNT GREATER 1) - add_dependencies(${NAME} ${TEST_NAME}) + add_dependencies(${NAME} ${TESTCASE_NAME}) endif() foreach(DEPENDEE ${__DEPENDEES}) - add_dependencies(${DEPENDEE} ${TEST_NAME}) + add_dependencies(${DEPENDEE} ${TESTCASE_NAME}) endforeach() - add_test( - NAME c${TEST_NAME} - COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${CMD_OPTIONS} - ) + set(TESTCASE_NAME c${TESTCASE_NAME}) + string(CONFIGURE "${_INLINE_PER_TEST_CODE_TEMPLATE}" _TEST_CODE @ONLY) + file(APPEND "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake" "${_TEST_CODE}") + file(APPEND "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake.in" "${_TEST_CODE}") - set_tests_properties(c${TEST_NAME} PROPERTIES DISABLED ${__DISABLE_TESTS}) + endforeach() - if (CUTLASS_INSTALL_TESTS) + # The following line imports the tests for immediate run via `make test`. - # To run the tests from an install package with tests enabled, we need to generate test files - # that don't rely on the current directory structure in build. 
+ include(${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake) - set(TEST_NAME c${TEST_NAME}) - set(TEST_EXE $) - set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR}) - configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY) + set(CUTLASS_CTEST_GENERATED_FILES ${CUTLASS_CTEST_GENERATED_FILES};ctest/${TEST_NAME}/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "") - file(GENERATE - OUTPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake" - INPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" - ) - - install( - FILES "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake" - DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest/ - ) - - set(CUTLASS_CTEST_GENERATED_FILES ${CUTLASS_CTEST_GENERATED_FILES};ctest/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "") - - endif() + if (CUTLASS_INSTALL_TESTS) - math(EXPR CMD_IDX "${CMD_IDX} + 1") + file(GENERATE + OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake" + INPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake.in" + ) - endforeach() + install( + FILES "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake" + DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest/${TEST_NAME} + RENAME CTestTestfile.${TEST_NAME}.cmake + ) + + endif() endfunction() @@ -732,8 +962,9 @@ if (CUTLASS_ENABLE_TOOLS) add_subdirectory(tools) if (CUTLASS_ENABLE_PROFILER) add_dependencies(test_all test_profiler) - endif() + endif() endif() + if (CUTLASS_ENABLE_EXAMPLES) add_subdirectory(examples) add_dependencies(test_all test_examples) @@ -741,52 +972,61 @@ endif() if (CUTLASS_ENABLE_TESTS) add_subdirectory(test) + if (CUTLASS_ENABLE_GTEST_UNIT_TESTS) add_dependencies(test_all test_unit) + endif() endif() if (CUTLASS_INSTALL_TESTS) - file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/cmake") + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/ctest") + + file(WRITE "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "# Generated File\n\n") + file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "cmake_policy(SET CMP0057 NEW) # Allow IN_LIST for if()\n\n") + file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "if (NOT DEFINED ENV{CUTLASS_TEST_SETS})\n") + file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" " set(ENV{CUTLASS_TEST_SETS} ${CUTLASS_DEFAULT_ACTIVE_TEST_SETS})\n") + file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "endif()\n\n") - file(WRITE "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" "# Generated File\n") foreach(GENERATED_FILE ${CUTLASS_CTEST_GENERATED_FILES}) - file(APPEND "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" "include(${GENERATED_FILE})\n") + file(APPEND "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" "include(${GENERATED_FILE})\n") endforeach() install( - FILES "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" + FILES "${CMAKE_BINARY_DIR}/ctest/CTestTestfile.cmake" DESTINATION "${CUTLASS_TEST_INSTALL_PREFIX}/" ) endif() -#? install( -#? FILES ${CMAKE_BINARY_DIR}/CTestTestfile.cmake -#? DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ -#? ) -#? -#? install( -#? DIRECTORY -#? ${CMAKE_BINARY_DIR}/tools -#? ${CMAKE_BINARY_DIR}/test -#? DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ -#? FILES_MATCHING PATTERN "CTestTestfile.cmake" -#? 
) - ################################################################################ +include(CMakePackageConfigHelpers) + +write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/NvidiaCutlassConfigVersion.cmake + COMPATIBILITY AnyNewerVersion) + +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/NvidiaCutlassConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/NvidiaCutlassConfig.cmake + @ONLY + ) + install( - FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/NvidiaCutlassConfig.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ + FILES + ${CMAKE_CURRENT_BINARY_DIR}/NvidiaCutlassConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/NvidiaCutlassConfigVersion.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/NvidiaCutlass/ ) install( EXPORT NvidiaCutlass NAMESPACE nvidia::cutlass:: - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/NvidiaCutlass/ FILE NvidiaCutlassTargets.cmake ) ################################################################################ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/NvidiaCutlassPackageConfig.cmake) + diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index dccfbda6fc..538bb65843 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -1,65 +1,87 @@ -![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS") +![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS") -[README](/README.md#documentation) > **Contributors** +[README](./README.md#documentation) > **Contributors** # CUTLASS Developers and Contributors This is the official list of CUTLASS developers and contributors. ## DEVELOPERS -Andrew Kerr -Haicheng Wu -Manish Gupta -Dustyn Blasig -Pradeep Ramani -Naila Farooqui -Piotr Majcher -Paul Springer -Jin Wang -Aniket Shivam -Chinmay Talegaonkar -Shang Zhang -Scott Yokim -Markus Hohnerbach -Aditya Atluri -David Tanner -Manikandan Ananth +Vijay Thakkar
+Pradeep Ramani
+Cris Cecka
+Aniket Shivam
+Jack Kosaian
+Mark Hoemmen
+Richard Cai
+Honghao Lu
+Ethan Yan
+Haicheng Wu
+Andrew Kerr
+Dustyn Blasig
+Fengqi Qiao
+Duane Merrill
+Yujia Zhai
+Rawn Henry
+Sergey Klevtsov
+Shang Zhang
+Piotr Majcher
+Paul Springer
+Markus Hohnerbach
+Jin Wang
+Aditya Atluri
+
+## CuTe
+Cris Cecka
+Vijay Thakkar
 ## CUTLASS Product Manager
-Matthew Nicely
-
-## CONTRIBUTORS
-Timothy Costa
-Julien Demouth
-Brian Fahs
-Michael Goldfarb
-Mostafa Hagog
-Fei Hu
-Alan Kaatz
-Tina Li
-Timmy Liu
-Duane Merrill
-Kevin Siu
-Markus Tavenrath
-John Tran
-Vicki Wang
-Junkai Wu
-Fung Xie
-Albert Xu
-Jack Yang
-Xiuxia Zhang
-Nick Zhao
+Matthew Nicely
-## ACKNOWLEDGEMENTS
+## Former CUTLASS Developers
+Manish Gupta
+Naila Farooqui
+David Tanner
+Manikandan Ananth
+Zhaodong Chen
+Chinmay Talegaonkar
-Girish Bharambe
-Cris Cecka
-Luke Durant
-Olivier Giroux
-Stephen Jones
-Rishkul Kulkarni
-Bryce Lelbach
-Joel McCormack
-Kyrylo Perelygin
+## CONTRIBUTORS
+Timothy Costa
+Julien Demouth
+Brian Fahs
+Michael Garland
+Michael Goldfarb
+Mostafa Hagog
+Fei Hu
+Alan Kaatz
+Tina Li
+Timmy Liu
+Wei Liu
+Tim Martin
+Duane Merrill
+Kevin Siu
+Markus Tavenrath
+John Tran
+Vicki Wang
+Junkai Wu
+Fung Xie
+Albert Xu
+Yang Xu
+Jack Yang
+Scott Yokim
+Xiuxia Zhang
+Nick Zhao
+## ACKNOWLEDGEMENTS +Girish Bharambe
+Luke Durant
+Carter Edwards
+Olivier Giroux
+Stephen Jones
+Rishkul Kulkarni
+Bryce Lelbach
+Joel McCormack
+Kyrylo Perelygin
+Sean Treichler
diff --git a/CUDA.cmake b/CUDA.cmake index ff6a6afc8a..7e91adb88d 100644 --- a/CUDA.cmake +++ b/CUDA.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -26,49 +26,46 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -if(CUDA_COMPILER MATCHES "[Cc]lang") - set(CUTLASS_NATIVE_CUDA_INIT ON) -elseif(CMAKE_VERSION VERSION_LESS 3.12.4) - set(CUTLASS_NATIVE_CUDA_INIT OFF) -else() - set(CUTLASS_NATIVE_CUDA_INIT ON) +if (CUDA_COMPILER MATCHES "[Cc]lang") + message(WARNING "CUDA_COMPILER flag is deprecated, set CMAKE_CUDA_COMPILER to desired compiler executable.") + set(__CLANG_DEVICE_COMPILATION_REQUESTED ON) +elseif(CUDA_COMPILER) + message(WARNING "Deprecated flag CUDA_COMPILER used with unknown argument ${CUDA_COMPILER}, ignoring.") endif() -set(CUTLASS_NATIVE_CUDA ${CUTLASS_NATIVE_CUDA_INIT} CACHE BOOL "Utilize the CMake native CUDA flow") - -if(NOT DEFINED ENV{CUDACXX} AND NOT DEFINED ENV{CUDA_BIN_PATH} AND DEFINED ENV{CUDA_PATH}) - # For backward compatibility, allow use of CUDA_PATH. - set(ENV{CUDACXX} $ENV{CUDA_PATH}/bin/nvcc) +if (__CLANG_DEVICE_COMPILATION_REQUESTED AND NOT DEFINED CMAKE_CUDA_COMPILER) + set(CMAKE_CUDA_COMPILER clang++) # We will let the system find Clang or error out endif() -if(CUTLASS_NATIVE_CUDA) +enable_language(CUDA) +find_package(CUDAToolkit REQUIRED) - enable_language(CUDA) - - if(NOT CUDA_VERSION) - set(CUDA_VERSION ${CMAKE_CUDA_COMPILER_VERSION}) - endif() - if(NOT CUDA_TOOLKIT_ROOT_DIR) - get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}/../.." ABSOLUTE) - endif() +if(NOT CUDA_VERSION) + # For backward compatibility with older CMake code. + set(CUDA_VERSION ${CUDAToolkit_VERSION}) + set(CUDA_VERSION_MAJOR ${CUDAToolkit_VERSION_MAJOR}) + set(CUDA_VERSION_MINOR ${CUDAToolkit_VERSION_MINOR}) +endif() +if(NOT CUDA_TOOLKIT_ROOT_DIR) + # In some scenarios, such as clang device compilation, the toolkit root may not be set, so we + # force it here to the nvcc we found via the CUDAToolkit package. + get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDAToolkit_NVCC_EXECUTABLE}/../.." ABSOLUTE) +endif() +if (CMAKE_CUDA_COMPILER_ID MATCHES "(nvcc|[Nn][Vv][Ii][Dd][Ii][Aa])") + set(CUTLASS_NVCC_DEVICE_COMPILE ON CACHE BOOL "Using nvcc tools for device compilation") +elseif (CMAKE_CUDA_COMPILER_ID MATCHES "[Cc]lang") + set(CUTLASS_CLANG_DEVICE_COMPILE ON CACHE BOOL "Using Clang tools for device compilation") else() + message(FATAL_ERROR "Uknown device-side compiler ${CMAKE_CUDA_COMPILER_ID} found. Set CMAKE_CUDA_COMPILER to either nvcc or clang++.") +endif() - find_package(CUDA REQUIRED) - # We workaround missing variables with the native flow by also finding the CUDA toolkit the old way. 
- - if(NOT CMAKE_CUDA_COMPILER_VERSION) - set(CMAKE_CUDA_COMPILER_VERSION ${CUDA_VERSION}) - endif() - +if (CUTLASS_CLANG_DEVICE_COMPILE AND CMAKE_VERSION VERSION_LESS_EQUAL "3.30") + message(FATAL_ERROR "Clang device compilation for CUTLASS requires CMake 3.30 or higher.") endif() if (CUDA_VERSION VERSION_LESS 9.2) - message(FATAL_ERROR "CUDA 9.2+ Required, Found ${CUDA_VERSION}.") -endif() -if(NOT CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "[Cc]lang") - set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc) - message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}") + message(FATAL_ERROR "CUDA 9.2+ required, found ${CUDA_VERSION}.") endif() find_library( @@ -76,11 +73,12 @@ find_library( PATHS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES + lib/x86_64-linux-gnu lib/x64 lib64 lib NO_DEFAULT_PATH - # We aren't going to search any system paths. We want to find the runtime + # We aren't going to search any system paths. We want to find the runtime # in the CUDA toolkit we're building against. ) @@ -95,10 +93,10 @@ if(NOT TARGET cudart AND CUDART_LIBRARY) # from the PATH search. else() add_library(cudart SHARED IMPORTED GLOBAL) - endif() + endif() add_library(nvidia::cudart ALIAS cudart) - + set_property( TARGET cudart PROPERTY IMPORTED_LOCATION @@ -120,13 +118,14 @@ find_library( PATHS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES + lib/x86_64-linux-gnu lib/x64 lib64 lib lib64/stubs lib/stubs NO_DEFAULT_PATH - # We aren't going to search any system paths. We want to find the runtime + # We aren't going to search any system paths. We want to find the runtime # in the CUDA toolkit we're building against. ) @@ -141,10 +140,10 @@ if(NOT TARGET cuda_driver AND CUDA_DRIVER_LIBRARY) # from the PATH search. else() add_library(cuda_driver SHARED IMPORTED GLOBAL) - endif() + endif() add_library(nvidia::cuda_driver ALIAS cuda_driver) - + set_property( TARGET cuda_driver PROPERTY IMPORTED_LOCATION @@ -170,7 +169,7 @@ find_library( lib64 lib NO_DEFAULT_PATH - # We aren't going to search any system paths. We want to find the runtime + # We aren't going to search any system paths. We want to find the runtime # in the CUDA toolkit we're building against. ) @@ -185,10 +184,10 @@ if(NOT TARGET nvrtc AND NVRTC_LIBRARY) # from the PATH search. else() add_library(nvrtc SHARED IMPORTED GLOBAL) - endif() - + endif() + add_library(nvidia::nvrtc ALIAS nvrtc) - + set_property( TARGET nvrtc PROPERTY IMPORTED_LOCATION @@ -209,16 +208,6 @@ include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) # Some platforms (e.g. Visual Studio) don't add the CUDA include directories to the system include # paths by default, so we add it explicitly here. 
-function(cutlass_correct_source_file_language_property) - if(CUDA_COMPILER MATCHES "[Cc]lang") - foreach(File ${ARGN}) - if(File MATCHES ".*\.cu$") - set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) - endif() - endforeach() - endif() -endfunction() - if (MSVC OR CUTLASS_LIBRARY_KERNELS MATCHES "all") set(CUTLASS_UNITY_BUILD_ENABLED_INIT ON) else() @@ -226,7 +215,14 @@ else() endif() set(CUTLASS_UNITY_BUILD_ENABLED ${CUTLASS_UNITY_BUILD_ENABLED_INIT} CACHE BOOL "Enable combined source compilation") -set(CUTLASS_UNITY_BUILD_BATCH_SIZE 16 CACHE STRING "Batch size for unified source files") + +if (MSVC) + set(CUTLASS_UNITY_BUILD_BATCH_SIZE_INIT 8) +else() + set(CUTLASS_UNITY_BUILD_BATCH_SIZE_INIT 16) +endif() + +set(CUTLASS_UNITY_BUILD_BATCH_SIZE ${CUTLASS_UNITY_BUILD_BATCH_SIZE_INIT} CACHE STRING "Batch size for unified source files") function(cutlass_unify_source_files TARGET_ARGS_VAR) @@ -239,15 +235,19 @@ function(cutlass_unify_source_files TARGET_ARGS_VAR) message(FATAL_ERROR "TARGET_ARGS_VAR parameter is required") endif() + if (NOT DEFINED __BATCH_SOURCES) + set(__BATCH_SOURCES ON) + endif() + if (__BATCH_SOURCES AND NOT DEFINED __BATCH_SIZE) set(__BATCH_SIZE ${CUTLASS_UNITY_BUILD_BATCH_SIZE}) endif() - if (CUTLASS_UNITY_BUILD_ENABLED AND DEFINED __BATCH_SIZE AND __BATCH_SIZE GREATER 1) + if (CUTLASS_UNITY_BUILD_ENABLED AND __BATCH_SOURCES AND __BATCH_SIZE GREATER 1) set(CUDA_FILE_ARGS) set(TARGET_SOURCE_ARGS) - + foreach(ARG ${__UNPARSED_ARGUMENTS}) if(${ARG} MATCHES ".*\.cu$") list(APPEND CUDA_FILE_ARGS ${ARG}) @@ -255,7 +255,7 @@ function(cutlass_unify_source_files TARGET_ARGS_VAR) list(APPEND TARGET_SOURCE_ARGS ${ARG}) endif() endforeach() - + list(LENGTH CUDA_FILE_ARGS NUM_CUDA_FILE_ARGS) while(NUM_CUDA_FILE_ARGS GREATER 0) list(SUBLIST CUDA_FILE_ARGS 0 ${__BATCH_SIZE} CUDA_FILE_BATCH) @@ -287,23 +287,20 @@ function(cutlass_unify_source_files TARGET_ARGS_VAR) endfunction() function(cutlass_add_library NAME) - set(options) + set(options SKIP_GENCODE_FLAGS) set(oneValueArgs EXPORT_NAME) set(multiValueArgs) cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cutlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS}) - - if(CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "clang") - cutlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS}) - add_library(${NAME} ${TARGET_SOURCE_ARGS}) - else() - set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE) - cuda_add_library(${NAME} ${TARGET_SOURCE_ARGS}) - endif() + + add_library(${NAME} ${TARGET_SOURCE_ARGS} "") cutlass_apply_standard_compile_options(${NAME}) - cutlass_apply_cuda_gencode_flags(${NAME}) + + if (NOT __SKIP_GENCODE_FLAGS) + cutlass_apply_cuda_gencode_flags(${NAME}) + endif() target_compile_features( ${NAME} @@ -311,6 +308,14 @@ function(cutlass_add_library NAME) cxx_std_11 ) + get_target_property(TARGET_TYPE ${NAME} TYPE) + + if (TARGET_TYPE MATCHES "SHARED") + set_target_properties(${NAME} PROPERTIES CUDA_RUNTIME_LIBRARY Shared) + elseif(TARGET_TYPE MATCHES "STATIC") + set_target_properties(${NAME} PROPERTIES CUDA_RUNTIME_LIBRARY Static) + endif() + if(__EXPORT_NAME) add_library(nvidia::cutlass::${__EXPORT_NAME} ALIAS ${NAME}) set_target_properties(${NAME} PROPERTIES EXPORT_NAME ${__EXPORT_NAME}) @@ -321,20 +326,23 @@ endfunction() function(cutlass_add_executable NAME) set(options) - set(oneValueArgs) + set(oneValueArgs CUDA_RUNTIME_LIBRARY) set(multiValueArgs) cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - 
cutlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS}) + if (NOT DEFINED __CUDA_RUNTIME_LIBRARY) + set(__CUDA_RUNTIME_LIBRARY Shared) + endif() - if(CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "clang") - cutlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS}) - add_executable(${NAME} ${TARGET_SOURCE_ARGS}) - else() - set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE) - cuda_add_executable(${NAME} ${TARGET_SOURCE_ARGS}) + set(__CUDA_RUNTIME_LIBRARY_ALLOWED None Shared Static) + if (NOT __CUDA_RUNTIME_LIBRARY IN_LIST __CUDA_RUNTIME_LIBRARY_ALLOWED) + message(FATAL_ERROR "CUDA_RUNTIME_LIBRARY value '${__CUDA_RUNTIME_LIBRARY}' is not in allowed list of '${__CUDA_RUNTIME_LIBRARY_ALLOWED}'") endif() + cutlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS}) + + add_executable(${NAME} ${TARGET_SOURCE_ARGS}) + cutlass_apply_standard_compile_options(${NAME}) cutlass_apply_cuda_gencode_flags(${NAME}) @@ -344,6 +352,8 @@ function(cutlass_add_executable NAME) cxx_std_11 ) + set_target_properties(${NAME} PROPERTIES CUDA_RUNTIME_LIBRARY ${__CUDA_RUNTIME_LIBRARY}) + endfunction() function(cutlass_target_sources NAME) @@ -354,7 +364,6 @@ function(cutlass_target_sources NAME) cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cutlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS}) - cutlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS}) target_sources(${NAME} ${TARGET_SOURCE_ARGS}) endfunction() diff --git a/LICENSE.txt b/LICENSE.txt index d9219ec9b9..525500841e 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause Redistribution and use in source and binary forms, with or without diff --git a/PUBLICATIONS.md b/PUBLICATIONS.md index 5c36742a88..ba0ef4cff8 100644 --- a/PUBLICATIONS.md +++ b/PUBLICATIONS.md @@ -1,16 +1,56 @@ # Publications Using Cutlass +## 2024 + +- ["ShadowKV: KV Cache in Shadows for High-Throughput Long-Context LLM Inference"](https://arxiv.org/abs/2410.21465). Hanshi Sun, Li-Wen Chang, Wenlei Bao, Size Zheng, Ningxin Zheng, Xin Liu, Harry Dong, Yuejie Chi, Beidi Chen. _arXiv_, October 2024. + +- ["FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion"](https://arxiv.org/abs/2406.06858). Li-Wen Chang, Wenlei Bao, Qi Hou, Chengquan Jiang, Ningxin Zheng, Yinmin Zhong, Xuanrun Zhang, Zuquan Song, Chengji Yao, Ziheng Jiang, Haibin Lin, Xin Jin, Xin Liu. _arXiv_, June 2024. + +- ["EVT: Accelerating Deep Learning Training with Epilogue Visitor Tree"](https://dl.acm.org/doi/10.1145/3620666.3651369). Zhaodong Chen, Andrew Kerr, Richard Cai, Jack Kosaian, Haicheng Wu, Yufei Ding, and Yuan Xie. _Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems_, April 2024. + +- ["Faster Neighborhood Attention: Reducing the O(n^2) Cost of Self Attention at the Threadblock Level"](https://arxiv.org/abs/2403.04690). Ali Hassani, Wen-Mei Hwu, Humphrey Shi. _arXiv_, March 2024. + +## 2023 + +- ["A Case Study in CUDA Kernel Fusion: Implementing FlashAttention-2 on NVIDIA Hopper Architecture using the CUTLASS Library"](https://arxiv.org/abs/2312.11918). Ganesh Bikshandi, Jay Shah. _arXiv_, December 2023. 
+ +- ["Benchmarking GPU Tensor Cores on General Matrix Multiplication Kernels through CUTLASS"](https://www.mdpi.com/2076-3417/13/24/13022). Xuanteng Huang, Xianwei Zhang, Panfei Yang, Nong Xiao. _Journal of Applied Sciences_, December 2023. + +- ["A Speed Odyssey for Deployable Quantization of LLMs"](https://arxiv.org/abs/2311.09550). Qingyuan Li, Ran Meng, Yiduo Li, Bo Zhang, Liang Li, Yifan Lu, Xiangxiang Chu, Yerui Sun, Yuchen Xie. _arXiv_, November 2023. + +- ["FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning"](https://arxiv.org/abs/2307.08691). Tri Dao. _Technical Report_, July 2023. + +- ["MegaBlocks: Efficient Sparse Training with Mixture-of-Experts"](https://arxiv.org/abs/2211.15841). Trevor Gale, Deepak Narayanan, Cliff Young, Matei Zaharia. _Proceedings of the Sixth Machine Learning and Systems_, May 2023. + +- ["ByteTransformer: A High-Performance Transformer Boosted for Variable-Length Inputs"](https://arxiv.org/abs/2210.03052). Yujia Zhai, Chengquan Jiang, Leyuan Wang, Xiaoying Jia, Shang Zhang, Zizhong Chen, Xin Liu, Yibo Zhu. _Proceedings of the 37th IEEE International Parallel & Distributed Processing Symposium (Best Paper)_, May 2023. + +- ["A Framework for Fine-Grained Synchronization of Dependent GPU Kernels"](https://arxiv.org/abs/2305.13450). Abhinav Jangda, Saeed Maleki, Maryam Mehri Dehnavi, Madan Musuvathi, Olli Saarikivi. _Computing Research Repository_, May 2023. + +- ["Graphene: An IR for Optimized Tensor Computations on GPUs"](https://dl.acm.org/doi/pdf/10.1145/3582016.3582018). Hagedorn, Bastian, Bin Fan, Hanfeng Chen, Cris Cecka, Michael Garland, Vinod Grover. _Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems_, March 2023. + +- ["Mixed Precision Post Training Quantization of Neural Networks with Sensitivity Guided Search"](https://arxiv.org/abs/2302.01382). Clemens JS Schaefer, Elfie Guo, Caitlin Stanton, Xiaofan Zhang, Tom Jablin, Navid Lambert-Shirzad, Jian Li, Chiachen Chou, Siddharth Joshi, Yu Emma Wang. _arXiv_, Feburary 2023. + +- ["Dynamic N:M Fine-Grained Structured Sparse Attention Mechanism"](https://dl.acm.org/doi/abs/10.1145/3572848.3577500). Zhaodong Chen, Zheng Qu, Yuying Quan, Liu Liu, Yufei Ding, Yuan Xie. _Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming_, Feburary 2023. + +- ["Stream-K: Work-centric Parallel Decomposition for Dense Matrix-Matrix Multiplication on the GPU"](https://arxiv.org/abs/2301.03598). Muhammad Osama, Duane Merrill, Cris Cecka, Michael Garland, John D. Owens. _arXiv_, January 2023. + ## 2022 +- ["GPU Load Balancing"](https://arxiv.org/abs/2212.08964). Muhammad Osama. _Doctoral dissertation, University of California, Davis_, December 2022. + +- ["Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production"](https://arxiv.org/abs/2211.10017). Young Jin Kim, Rawn Henry, Raffy Fahim, Hany Hassan Awadalla. _Proceedings of the Third Workshop on Simple and Efficient Natural Language Processing_, December 2022. + - ["Bolt: Bridging the Gap between Auto-tuners and Hardware-native Performance"](https://arxiv.org/abs/2110.15238). Jiarong Xing, Leyuan Wang, Shang Zhang, Jack Chen, Ang Chen, Yibo Zhu. _Proceedings of the 5th MLSys Conference_, August 2022. - ["Recovering single precision accuracy from Tensor Cores while surpassing the FP32 theoretical peak performance"](https://arxiv.org/abs/2203.03341). Hiroyuki Ootomo, Rio Yokota. 
_International Journal of High Performance Computing_, March 2022. +- ["Breaking the Computation and Communication Abstraction Barrier in Distributed Machine Learning Workloads"](https://arxiv.org/abs/2105.05720). Abhinav Jangda, Jun Huang, Guodong Liu, Amir Hossein Nodehi Sabet, Saeed Maleki, Youshan Miao, Madanlal Musuvathi, Todd Mytkowicz, Olli Sarikivi. _Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems_, February 2022. + ## 2021 - ["Arithmetic-intensity-guided fault tolerance for neural network inference on GPUs"](https://dl.acm.org/doi/abs/10.1145/3458817.3476184). Jack Kosaian, K. V. Rashmi. _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, November 2021. -- ["Real-time Neural Radiance Caching for Path Tracing"](https://d1qx31qr3h6wln.cloudfront.net/publications/paper_4.pdf). Thomas Muller, Fabrice Rousselle, Jan Novak, Alex Keller. _ACM Trans. Graph._, August 2021. +- ["Real-time Neural Radiance Caching for Path Tracing"](https://dl.acm.org/doi/abs/10.1145/3450626.3459812). Thomas Muller, Fabrice Rousselle, Jan Novak, Alex Keller. _ACM Trans. Graph._, August 2021. ## 2020 diff --git a/README.md b/README.md index 78ca725c1a..e61335f240 100644 --- a/README.md +++ b/README.md @@ -1,153 +1,195 @@ -![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") +![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") -# CUTLASS 2.9 +# CUTLASS 3.6.0 -_CUTLASS 2.9 - April 2022_ +_CUTLASS 3.6.0 - October 2024_ CUTLASS is a collection of CUDA C++ template abstractions for implementing -high-performance matrix-multiplication (GEMM) and related computations at all levels +high-performance matrix-matrix multiplication (GEMM) and related computations at all levels and scales within CUDA. It incorporates strategies for hierarchical decomposition and data movement similar to those used to implement cuBLAS and cuDNN. CUTLASS decomposes these "moving parts" into reusable, modular software components abstracted by C++ template -classes. These thread-wide, warp-wide, block-wide, and device-wide primitives can be specialized -and tuned via custom tiling sizes, data types, and other algorithmic policy. The -resulting flexibility simplifies their use as building blocks within custom kernels -and applications. +classes. Primitives for different levels of a conceptual parallelization hierarchy +can be specialized and tuned via custom tiling sizes, data types, +and other algorithmic policy. The resulting flexibility simplifies their use +as building blocks within custom kernels and applications. To support a wide variety of applications, CUTLASS provides extensive support for mixed-precision computations, providing specialized data-movement and multiply-accumulate abstractions for half-precision floating point (FP16), BFloat16 (BF16), Tensor Float 32 (TF32), -single-precision floating point (FP32), double-precision floating -point (FP64) types, integer data types (4b and 8b), and binary data types (1b). -CUTLASS demonstrates warp-synchronous matrix multiply operations -targeting the programmable, high-throughput _Tensor Cores_ implemented by -NVIDIA's Volta, Turing, and Ampere architectures. 
+single-precision floating point (FP32), +[FP32 emulation via tensor core instruction](./examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm), +double-precision floating +point (FP64) types, integer data types (4b and 8b), and binary data types (1b). +CUTLASS demonstrates warp-synchronous matrix multiply operations +targeting the programmable, high-throughput _Tensor Cores_ implemented by +NVIDIA's Volta, Turing, Ampere, and Hopper architectures. -CUTLASS implements high-performance Convolution via the implicit GEMM algorithm. -Implicit GEMM is the formulation of a convolution operation as a GEMM thereby taking advantage of -CUTLASS's modular GEMM pipeline. -This allows CUTLASS to build convolutions by reusing highly optimized warp-wide GEMM components and below. +See the [Quick Start Guide](./media/docs/quickstart.md) to get started quickly. -See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly. - -See the [functionality listing](/media/docs/functionality.md) for the list of operations +See the [functionality listing](./media/docs/functionality.md) for the list of operations supported at each level of the execution model hierarchy. -# What's New in CUTLASS 2.9 - -CUTLASS 2.9 is an update to CUTLASS adding: -- [First layer Convolution kernels](/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) specialized for small channel counts and reduced alignment -- [BLAS3](https://docs.nvidia.com/cuda/cublas/index.html#cublas-level-3-function-reference) operators accelerated by Tensor Cores - - [SYRK](/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu), [HERK](/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu), - - [SYR2K](/test/unit/gemm/device/syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu), [HER2K](/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu), - - [Out-of-place TRMM](/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu), and - - [SYMM](/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu), [HEMM](/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu) -- [CUTLASS Python](/examples/40_cutlass_py) demonstrating JIT compilation of CUTLASS kernels and a Python-based runtime using [CUDA Python](https://developer.nvidia.com/cuda-python) -- [GEMM + Softmax example](/examples/35_gemm_softmax) -- [Gather and Scatter Fusion with GEMM](/examples/36_gather_scatter_fusion) can gather inputs and scatters outputs based on indices vectors in the same GEMM kernel. -- [Back-to-back GEMM/CONV](examples/13_two_tensor_op_fusion) fully supports buffering the first GEMM/CONV results in the shared memory for the latter one to use. Bias Vector add is also supported in the first GEMM/CONV. -- [Transposed Convolution](/examples/34_transposed_conv2d) (a.k.a Deconvolution) support which reuses Dgrad implementation. -- [Utility functions](/tools/util/include/cutlass/util) that can pad NHWC and convert between NCHW and NHWC. -- [Small alignment implicit gemm](https://github.com/NVIDIA/cutlass/issues/242) support for Fprop/Dgrad/Wgrad so that padding is no longer mandated to use tensor cores. -- Epilogue enhancement with performance improvement, more activation functions, and more fusion patterns. -- [Group GEMM](/examples/24_gemm_grouped) thread block number calculation fix. 
-- Optimal performance using [CUDA 11.7](https://developer.nvidia.com/cuda-downloads) -- [Parallel GEMM splitk](https://github.com/NVIDIA/cutlass/pull/277) support in the CUTLASS profiler. -- Updates and bugfixes from the community (thanks!) -- **Deprecation announcement:** CUTLASS plans to deprecate the following: - - Maxwell and Pascal GPU architectures - - Ubuntu 16.04 - - CUDA 10.2 +CUTLASS 3.0 introduced a new core library, CuTe, to describe and manipulate tensors of threads and data. +CuTe is a collection of C++ CUDA template abstractions for defining and operating on hierarchically multidimensional layouts of threads and data. CuTe provides `Layout` and `Tensor` objects that compactly package the type, shape, memory space, and layout of data, while performing the complicated indexing for the user. This lets programmers focus on the logical descriptions of their algorithms while CuTe does the mechanical bookkeeping for them. With these tools, we can quickly design, implement, and modify all dense linear algebra operations. + +The core abstractions of CuTe are hierarchically multidimensional layouts which can be composed with data arrays to represent tensors. The representation of layouts is powerful enough to represent nearly everything we need to implement efficient dense linear algebra. Layouts can also be combined and manipulated via functional composition, on which we build a large set of common operations such as tiling and partitioning. + +CUTLASS 3.0 and beyond adopts CuTe throughout the GEMM hierarchy in its templates. This greatly simplifies the design +and improves code composability and readability. More documentation specific to CuTe can be found in its [dedicated documentation directory](./media/docs/cute/00_quickstart.md). + +In addition to GEMMs, CUTLASS implements high-performance convolution via the implicit GEMM algorithm. Implicit GEMM is the formulation of a convolution operation as a GEMM thereby taking advantage of CUTLASS's modular GEMM pipeline. This allows CUTLASS to build convolutions by reusing highly-optimized GEMM components. + + +# What's New in CUTLASS 3.6 + +CUTLASS 3.6.0 is an update to CUTLASS adding: + +- [Hopper structured sparse GEMM](./examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu). + + [FP16](./test/unit/gemm/device/sm90_sparse_gemm_f16_f16_f32_tensor_op_f32.cu) + + [FP8](./test/unit/gemm/device/sm90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu) + + [INT8](./test/unit/gemm/device/sm90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu) + + [TF32](./test/unit/gemm/device/sm90_sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu) +- A refactor to the CUTLASS 3.x convolution `kernel::ConvUniversal` [API](./include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp) to bring it in line with `gemm::GemmUniversal`. Now the 3.x convolution API is no longer considered as a beta API. +- [An improved mixed input GEMM](./examples/55_hopper_mixed_dtype_gemm/README.md) and a [lookup table implementation](./examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu) for `INT4`x`FP8` scale-only mode. +- [EVT nodes for Top-K selection and softmax](./include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp) and [GEMM example using those](./examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu). 
+- [Programmatic Dependent Launch](./include/cutlass/arch/grid_dependency_control.h) (PDL) that leverages a new Hopper feature to speedup two back-to-back kernels, and its corresponding [documentations](./media/docs/dependent_kernel_launch.md). +- [A new debugging tool, synclog](./include/cutlass/arch/synclog.hpp), for dumping out all synchronization events from within a kernel to a file. Please see [synclog documentation](./media/docs/utilities.md#debugging-asynchronous-kernels-with-cutlasss-built-in-synclog-tool) for details. +- A new TMA-enabled [epilogue](./include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for grouped GEMM that brings significant performance improvement, as well as its EVT support. +- A SIMT-enabled pointer-array [epilogue](./include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp). +- A new [Ping-Pong kernel schedule for Grouped GEMM](./include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp) and some other optimizations. +- [A new instantiation strategy for CUTLASS profiler kernels](./python/cutlass_library/sm90_shapes.py) along with [improved documentation for instantiation level in CUTLASS profiler](./media/docs/profiler.md#instantiating-more-kernels-with-hopper). +- A new hardware support for comparisons and computations of [`cutlass::bfloat16_t`](./include/cutlass/bfloat16.h) +- Fixed use of isnan on Windows for [`half_t`](./test/unit/core/functional.cu). + +Minimum requirements: + +- Architecture: Volta +- Compiler: Must support at least C++17 +- CUDA Toolkit version: 11.4 + +Starting from CUTLASS 3.0, CUTLASS removed support for the following: + +- Maxwell and Pascal GPU architectures +- Ubuntu 16.04 +- CUDA 10.2 +- C++ language versions less than 17. **See the [CHANGELOG](CHANGELOG.md) for a detailed listing of releases and updates.** # Performance -

+

+

CUTLASS primitives are very efficient. When used to construct device-wide GEMM kernels, -they exhibit performance comparable to cuBLAS for scalar GEMM -computations. The above figure shows CUTLASS performance relative to cuBLAS -for large matrix dimensions on an [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/), -an [NVIDIA A2](https://www.nvidia.com/en-us/data-center/products/a2/), -an [NVIDIA TitanV](https://www.nvidia.com/en-us/titan/titan-v/), -and an [NVIDIA GeForce 2080 Ti](https://www.nvidia.com/en-us/geforce/graphics-cards/rtx-2080-ti/) -compiled with the [CUDA 11.5 Toolkit](https://developer.nvidia.com/cuda-downloads). Tensor Core operations are implemented using CUDA's -[mma instruction](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma). +they exhibit peak performance comparable to cuBLAS for scalar GEMM +computations. The above figure shows the continual CUTLASS performance improvements +on an [NVIDIA H100](https://www.nvidia.com/en-us/data-center/h100/) (NVIDIA Hopper architecture) since +CUTLASS 3.1. +CUTLASS 3.5.1 was compiled with the [CUDA 12.5u1 Toolkit](https://developer.nvidia.com/cuda-downloads). +Tensor Core operations are implemented using CUDA's +[mma](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma) and +[wgmma](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) instructions. -

+

When using CUTLASS building blocks to construct device-wide implicit gemm (Fprop, Dgrad, and Wgrad) kernels, CUTLASS performance is also comparable to cuDNN when running Resnet-50 layers on an [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) -as shown in the above figure. Tensor Core operations are still implemented using CUDA's +as shown in the above figure. Tensor Core operations are implemented using CUDA's [mma instruction](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma). # Compatibility -CUTLASS requires a C++11 host compiler and -performs best when built with the [**CUDA 11.6u2 Toolkit**](https://developer.nvidia.com/cuda-toolkit). -It is also compatible with CUDA 11.0, CUDA 11.1, CUDA 11.2, CUDA 11.3, CUDA 11.4, and CUDA 11.5. +CUTLASS requires a C++17 host compiler and +performs best when built with the [**CUDA 12.4 Toolkit**](https://developer.nvidia.com/cuda-downloads). +It is also compatible with CUDA 11.4, CUDA 11.5, CUDA 11.6, CUDA 11.7, CUDA 11.8, CUDA 12.0, CUDA 12.1, CUDA 12.2.2, CUDA 12.3.1 and CUDA 12.3.2. +## Operating Systems We have tested the following environments. |**Operating System** | **Compiler** | |-----------------|----------| -| Windows 10 | Microsoft Visual Studio 2015| -| | Microsoft Visual Studio 2017| -| | Microsoft Visual Studio 2019| -| Ubuntu 18.04 | GCC 7.5.0 | +| Ubuntu 18.04 | GCC 7.5.0 | | Ubuntu 20.04 | GCC 10.3.0 | -| Ubuntu 21.04 | GCC 11.2.0 | +| Ubuntu 22.04 | GCC 11.2.0 | +| Ubuntu 22.04 | Clang 10.0.0 | +| Ubuntu 22.04 | Clang 14.0.6 | +| Ubuntu 22.04 | Clang 17.0.6 | +| Windows 10.0 | Visual Studio 2019 v16.11.27 | + +Note: GCC 8.5.0 has known regressions regarding fold expressions and overloaded operators. Using GCC 7.5.0 or (preferred) GCC >= 9 is recommended. + +## Hardware +CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on Volta, Turing, Ampere, Ada, and Hopper architecture based NVIDIA GPUs. + +|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit Required by CUTLASS-3**| +|---|---|---| +|NVIDIA V100 Tensor Core GPU |7.0|11.4| +|NVIDIA TitanV |7.0|11.4| +|NVIDIA GeForce RTX 2080 TI, 2080, 2070 |7.5|11.4| +|NVIDIA T4 |7.5|11.4| +|NVIDIA A100 Tensor Core GPU |8.0|11.4| +|NVIDIA A10 |8.6|11.4| +|NVIDIA GeForce RTX 3090 |8.6|11.4| +|NVIDIA GeForce RTX 4090 |8.9|11.8| +|NVIDIA L40 |8.9|11.8| +|NVIDIA H100 Tensor Core GPU |9.0|11.8| -Additionally, CUTLASS may be built with clang. -See [these instructions](media/docs/quickstart.md#clang) for more details. +## Target Architecture -CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on -any Volta-, Turing-, or NVIDIA Ampere- architecture NVIDIA GPU. +In general, PTX code generated for one target architecture can be run on future architectures (i.e., it is forward compatible). However, CUDA 12.0 introduced the concept of "architecture-accelerated features" whose PTX does not have forward compatibility guarantees. Several Hopper PTX instructions fall under this category of architecture-accelerated features, and thus require a `sm_90a` target architecture (note the "a" appended). For more details on this and other architecture-accelerated instructions, please refer to the [CUDA Documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#feature-availability). 
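
To make the distinction concrete, here is a minimal, hypothetical sketch (not taken from the repository): `nvcc` defines `__CUDA_ARCH_FEAT_SM90_ALL` only when device code is compiled for the `sm_90a` target, so Hopper-only paths can be guarded on that macro in addition to `__CUDA_ARCH__`, and a build that targets plain `sm_90` falls back rather than emitting unsupported instructions. The kernel name below is illustrative only.

```c++
// Hedged sketch, not repository code: select the architecture-accelerated path
// only when this translation unit is actually compiled for sm_90a.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void which_path() {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL)
  // Hopper architecture-accelerated features (e.g. WGMMA/TMA-based paths) may be used here.
  printf("compiled for sm_90a\n");
#else
  // Portable fallback; plain sm_90 or older targets land here.
  printf("compiled without Hopper architecture-accelerated features\n");
#endif
}

int main() {
  which_path<<<1, 1>>>();
  cudaDeviceSynchronize();
  return 0;
}
```

Selecting `90a` through the `CUTLASS_NVCC_ARCHS` CMake setting, as shown just below, is what makes the first branch active.
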
-|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit**|**Minimum CUDA Toolkit Enabling Native Tensor Cores**| -|---|---|---|---| -|NVIDIA Tesla V100|7.0|9.2|10.1| -|NVIDIA TitanV|7.0|9.2|10.1| -|NVIDIA GeForce RTX 2080 TI, 2080, 2070|7.5|10.0|10.2| -|NVIDIA Tesla T4|7.5|10.0|10.2| -|NVIDIA A100|8.0|11.0|11.0| -|NVIDIA A10 |8.6|11.1|11.1| -|NVIDIA GeForce 3090|8.6|11.1|11.1| +The target architecture information is passed on to CUTLASS via the cmake flag `CUTLASS_NVCC_ARCHS`. In order to maximize performance on Hopper GH100, users are required to build CUTLASS with `90a` as the target architecture. If a user accidentally builds a kernel which uses SM90a features (e.g. Hopper Tensor Core Instructions), using the SM90 target (note the lack of "a"), with either CUDA Toolkit 12 or 11.8, the kernel is expected to fail with a runtime error. -For all GPUs, we recommend compiling with the [CUDA 11.6u2 Toolkit](https://developer.nvidia.com/cuda-toolkit) -for best performance. +``` +cmake .. -DCUTLASS_NVCC_ARCHS="90a" +``` + +Please refer to the [functionality documentation](./media/docs/functionality.md) for details on which kernels require which target architectures. # Documentation CUTLASS is described in the following documents and the accompanying [Doxygen documentation](https://nvidia.github.io/cutlass). -- [Quick Start Guide](/media/docs/quickstart.md) - build and run CUTLASS -- [Functionality](/media/docs/functionality.md) - summarizes functionality available in CUTLASS -- [Efficient GEMM in CUDA](media/docs/efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA -- [GEMM API](media/docs/gemm_api.md) - describes the CUTLASS GEMM model and C++ template concepts -- [Implicit GEMM Convolution](media/docs/implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS -- [Code Organization](media/docs/code_organization.md) - describes the organization and contents of the CUTLASS project -- [Terminology](media/docs/terminology.md) - describes terms used in the code -- [Programming Guidelines](media/docs/programming_guidelines.md) - guidelines for writing efficient modern CUDA C++ -- [Fundamental types](media/docs/fundamental_types.md) - describes basic C++ classes used in CUTLASS to represent numeric quantities and arrays -- [Layouts](media/docs/layout.md) - describes layouts of matrices and tensors in memory -- [Tile Iterators](media/docs/tile_iterator_concept.md) - describes C++ concepts for iterating over tiles of matrices in memory -- [CUTLASS Profiler](media/docs/profiler.md) - command-line driven profiling application -- [CUTLASS Utilities](media/docs/utilities.md) - additional templates used to facilate rapid development - +- [Quick Start Guide](./media/docs/quickstart.md) - build and run CUTLASS +- [Functionality](./media/docs/functionality.md) - summarizes functionality available in CUTLASS +- [Efficient GEMM in CUDA](./media/docs/efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA +- [CUTLASS 3.x Design](./media/docs/cutlass_3x_design.md) - describes the CUTLASS 3.x design, its benefits, and how CuTe enables us to write much more composable components +- [GEMM API 3.x](./media/docs/gemm_api_3x.md) - describes the CUTLASS 3.x GEMM model and C++ template concepts +- [GEMM API 2.x](./media/docs/gemm_api.md) - describes the CUTLASS 2.x GEMM model and C++ template concepts +- [Implicit GEMM Convolution](./media/docs/implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS +- [Code 
Organization](./media/docs/code_organization.md) - describes the organization and contents of the CUTLASS project
+- [Terminology](./media/docs/terminology.md) - describes terms used in the code
+- [Programming Guidelines](./media/docs/programming_guidelines.md) - guidelines for writing efficient modern CUDA C++
+- [Fundamental types](./media/docs/fundamental_types.md) - describes basic C++ classes used in CUTLASS to represent numeric quantities and arrays
+- [Layouts](./media/docs/layout.md) - describes layouts of matrices and tensors in memory
+- [Tile Iterators](./media/docs/tile_iterator_concept.md) - describes C++ concepts for iterating over tiles of matrices in memory
+- [CUTLASS Profiler](./media/docs/profiler.md) - command-line driven profiling application
+- [CUTLASS Utilities](./media/docs/utilities.md) - additional templates used to facilitate rapid development
+- [Dependent kernel launch](./media/docs/dependent_kernel_launch.md) - describes a new feature in Hopper which allows overlapping dependent
+kernels in the same stream, and how it is used in CUTLASS.
+
+# Resources
 We have also described the structure of an efficient GEMM in our talk at the [GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf).
+
+ - [CUTLASS: Software Primitives for Dense Linear Algebra at All Levels and Scales within CUDA](https://www.nvidia.com/en-us/on-demand/session/gtcsiliconvalley2018-s8854/)
+
+ - [Developing CUDA Kernels to Push Tensor Cores to the Absolute Limit on NVIDIA A100](https://www.nvidia.com/en-us/on-demand/session/gtcsj20-s21745/)
+
+ - [Accelerating Convolution with Tensor Cores in CUTLASS](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s31883/)
+
+ - [Accelerating Backward Data Gradient by Increasing Tensor Core Utilization in CUTLASS](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41996/)
+
+ - [CUTLASS: Python API, Enhancements, and NVIDIA Hopper](https://www.nvidia.com/en-us/on-demand/session/gtcfall22-a41131/)
+
 # Building CUTLASS
 
 CUTLASS is a header-only template library and does not need to be built to be used by other
 projects. Client applications should target CUTLASS's `include/` directory in their include
 paths.
 
-CUTLASS unit tests, examples, and utilities can be build with CMake starting version 3.12.
+CUTLASS unit tests, examples, and utilities can be built with CMake.
+The minimum version of CMake is given in the [Quickstart guide](./media/docs/quickstart.md).
 Make sure the `CUDACXX` environment variable points to NVCC in the CUDA Toolkit installed
 on your system.
 
@@ -156,7 +198,8 @@ $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
 ```
 
 Create a build directory within the CUTLASS project, then run CMake. By default CUTLASS will build kernels
-for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, and 8.6. To reduce compile time you can specify
+for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, 8.6, 8.9, and 9.0.
+To reduce compile time you can specify
 the architectures to build CUTLASS for by changing the CMake configuration setting
 `CUTLASS_NVCC_ARCHS`.
 
@@ -191,7 +234,7 @@ CUTLASS is arranged as a header-only library along with Utilities, Tools, Exampl
 and template concepts defined in the CUTLASS project. A detailed explanation of the source code organization may be found in the
-[CUTLASS documentation](media/docs/code_organization.md), but several main components are summarized below. 
+[CUTLASS documentation](./media/docs/code_organization.md), but several main components are summarized below. ## CUTLASS Template Library @@ -204,6 +247,8 @@ include/ # client applications should target this directory conv/ # code specialized for convolution + epilogue/ # code specialized for the epilogue of gemm/convolution + gemm/ # code specialized for general matrix product computations layout/ # layout definitions for matrices, tensors, and other mathematical objects in memory @@ -211,58 +256,34 @@ include/ # client applications should target this directory platform/ # CUDA-capable Standard Library components reduction/ # bandwidth-limited reduction kernels that do not fit the "gemm" model + + thread/ # simt code that can be performed within a CUDA thread transform/ # code specialized for layout, type, and domain transformations * # core vocabulary types, containers, and basic numeric operations -``` - -### CUTLASS SDK Examples - -[CUTLASS SDK examples](/examples) apply CUTLASS templates to implement basic computations. - -``` -examples/ - 00_basic_gemm/ # launches a basic GEMM with single precision inputs and outputs - - 01_cutlass_utilities/ # demonstrates CUTLASS Utilities for allocating and initializing tensors - - 02_dump_reg_smem/ # debugging utilities for printing register and shared memory contents - - 03_visualize_layout/ # utility for visualizing all layout functions in CUTLASS - - 04_tile_iterator/ # example demonstrating an iterator over tiles in memory - - 05_batched_gemm/ # example demonstrating CUTLASS's batched strided GEMM operation - - 06_splitK_gemm/ # exmaple demonstrating CUTLASS's Split-K parallel reduction kernel - - 07_volta_tensorop_gemm/ # example demonstrating mixed precision GEMM using Volta Tensor Cores - 08_turing_tensorop_gemm/ # example demonstrating integer GEMM using Turing Tensor Cores + cute/ # CuTe Layout, layout algebra, MMA/Copy atoms, tiled MMA/Copy - 09_turing_tensorop_conv2dfprop/ # example demonstrating integer implicit GEMM convolution (forward propagation) using Turing Tensor Cores + algorithm/ # Definitions of core operations such as copy, gemm, and operations on cute::tuples - 10_planar_complex/ # example demonstrating planar complex GEMM kernels + arch/ # Bare bones PTX wrapper structs for copy and math instructions - 11_planar_complex_array/ # example demonstrating planar complex kernels with batch-specific problem sizes + atom/ # Meta-information either link to or built from arch/ operators - 12_gemm_bias_relu/ # example demonstrating GEMM fused with bias and relu + mma_atom.hpp # cute::Mma_Atom and cute::TiledMma - 13_fused_two_gemms/ # example demonstrating two GEMms fused in one kernel + copy_atom.hpp # cute::Copy_Atom and cute::TiledCopy - 22_ampere_tensorop_conv2dfprop/ # example demonstrating integer implicit GEMM convolution (forward propagation) using Ampere Tensor Cores + *sm*.hpp # Arch specific meta-information for copy and math operations - 31_basic_syrk # example demonstrating Symetric rank-K update + * # Core library types such as Shape, Stride, Layout, Tensor, and associated operations - 32_basic_trmm # +``` - 33_ampere_3xtf32_tensorop_symm # +### CUTLASS SDK Examples - 35_gemm_softmax # example demonstrating GEMM fused with Softmax in mixed precision using Ampere Tensor Cores - - 40_cutlass_py # example demonstrating CUTLASS with CUDA Python -``` +[CUTLASS SDK examples](./examples) apply CUTLASS templates to implement basic computations. 
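
As a minimal sketch of the CuTe vocabulary listed in the tree above (Layout, Tensor, Shape, Stride), the snippet below builds a small column-major layout and a non-owning tensor view over host memory. It assumes only that `include/` and the CUDA Toolkit headers are on the include path and that C++17 is enabled; it is illustrative, not one of the shipped examples.

```c++
// Minimal CuTe sketch: a Layout maps logical coordinates to linear offsets,
// and a Tensor pairs such a layout with a (non-owning) pointer to data.
#include <cstdio>
#include <vector>
#include <cute/tensor.hpp>

int main() {
  using namespace cute;

  // 4x8 column-major layout: shape (4,8), strides (1,4).
  auto layout = make_layout(make_shape(Int<4>{}, Int<8>{}),
                            make_stride(Int<1>{}, Int<4>{}));

  std::vector<float> storage(size(layout), 0.0f);   // 32 elements
  auto tensor = make_tensor(storage.data(), layout); // view over storage

  tensor(2, 3) = 1.0f;   // the layout performs the index arithmetic
  print(layout);         // e.g. prints (_4,_8):(_1,_4)
  printf("\nelement (2,3) lives at linear offset %d\n", int(layout(2, 3))); // 2 + 3*4
  return 0;
}
```
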
### Tools @@ -287,7 +308,7 @@ tools/ The `test/unit/` directory consist of unit tests implemented with Google Test that demonstrate basic usage of Core API components and complete tests of the CUTLASS GEMM computations. -Instructions for building and running the Unit tests are described in the [Quickstart guide](media/docs/quickstart.md). +Instructions for building and running the Unit tests are described in the [Quickstart guide](./media/docs/quickstart.md). # Performance Profiling @@ -301,9 +322,11 @@ $ make cutlass_profiler -j16 By default, only one tile size is instantiated for each data type, math instruction, and layout. To instantiate all, set the following environment variable when running CMake from an empty `build/` directory. -Beware, this results in *thousands* of kernels and long build times. +Beware, this results in *tens of thousands* of kernels and long build times. +This would also result in a large binary size and on some platforms linker to fail on building the library. +Therefore, it's highly recommended to generate only a subset of kernels as demonstrated in the sub-section below. ```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=all +$ cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_LIBRARY_KERNELS=all ... $ make cutlass_profiler -j16 ``` @@ -316,7 +339,7 @@ or a subset of kernels for NVIDIA Ampere and Turing architecture: ### Building a subset Tensor Core GEMM kernels -To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targetting NVIDIA Ampere and Turing architecture, +To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8 @@ -364,7 +387,7 @@ reference_device: Passed ### Building one CUDA Core GEMM kernel -To compile one SGEMM kernel targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: +To compile one SGEMM kernel targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1 ... @@ -406,7 +429,7 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 ### Building a subset of Tensor Core Convolution kernels To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation -and FP16 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: +and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16 ... @@ -454,7 +477,7 @@ reference_device: Passed ### Building one Convolution CUDA kernel To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation -and FP32 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: +and FP32 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc ... 
@@ -501,9 +524,9 @@ reference_device: Passed ## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler - Please follow the links for more CMake examples on selectively compiling CUTLASS kernels: - - [GEMM CMake Examples](media/docs/quickstart.md#gemm-cmake-examples) - - [Implicit GEMM conovlution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples) -- [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md) + - [GEMM CMake Examples](./media/docs/quickstart.md#gemm-cmake-examples) + - [Implicit GEMM convolution CMake Examples](./media/docs/quickstart.md#convolution-cmake-examples) +- [Further details about the CUTLASS Profiler are described here.](./media/docs/profiler.md) # About @@ -517,7 +540,7 @@ The official list of CUTLASS developers and contributors is available here: [CON # Copyright -Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause ``` @@ -546,4 +569,3 @@ SPDX-License-Identifier: BSD-3-Clause OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ``` - diff --git a/bin2hex.cmake b/bin2hex.cmake index b0773dd659..b34e02849f 100644 --- a/bin2hex.cmake +++ b/bin2hex.cmake @@ -1,3 +1,31 @@ +# Copyright (c) 2019 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ # A small utility function which generates a C-header from an input file function(FILE_TO_C_STRING FILENAME VARIABLE_NAME OUTPUT_STRING ZERO_TERMINATED) FILE(READ "${FILENAME}" HEX_INPUT HEX) @@ -6,7 +34,7 @@ function(FILE_TO_C_STRING FILENAME VARIABLE_NAME OUTPUT_STRING ZERO_TERMINATED) endif() string(REGEX REPLACE "(....)" "\\1\n" HEX_OUTPUT ${HEX_INPUT}) - string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," HEX_OUTPUT ${HEX_OUTPUT}) + string(REGEX REPLACE "([0-9a-f][0-9a-f])" "char(0x\\1)," HEX_OUTPUT ${HEX_OUTPUT}) set(HEX_OUTPUT "static char const ${VARIABLE_NAME}[] = {\n ${HEX_OUTPUT}\n};\n") diff --git a/cmake/CTestTestfile.config.cmake b/cmake/CTestTestfile.config.cmake deleted file mode 100644 index 0705b19c12..0000000000 --- a/cmake/CTestTestfile.config.cmake +++ /dev/null @@ -1,21 +0,0 @@ -# Generated file - -if (DEFINED ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT}) - set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT $ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT}) -else() - set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT @CUTLASS_TEST_EXECUTION_ENVIRONMENT@) -endif() - -if (NOT "@TEST_EXE_DIR@" STREQUAL "") - set(TEST_EXE_PATH @TEST_EXE_DIR@/@TEST_EXE@) -else() - set(TEST_EXE_PATH @TEST_EXE@) -endif() - -add_test("@TEST_NAME@" ${_CUTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}" @TEST_COMMAND_OPTIONS@) - -if (NOT "@TEST_EXE_WORKING_DIRECTORY@" STREQUAL "") - set_tests_properties("@TEST_NAME@" PROPERTIES WORKING_DIRECTORY "@TEST_EXE_WORKING_DIRECTORY@") -endif() - -set_tests_properties(@TEST_NAME@ PROPERTIES DISABLED @__DISABLE_TESTS@) diff --git a/cmake/CTestTestfile.configure.cmake b/cmake/CTestTestfile.configure.cmake new file mode 100644 index 0000000000..611b3d181f --- /dev/null +++ b/cmake/CTestTestfile.configure.cmake @@ -0,0 +1,52 @@ +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# Generated file + +set(TEST_SETS_SUPPORTED @TEST_SETS_SUPPORTED@) + +if (NOT DEFINED ENV{CUTLASS_TEST_SETS}) + set(ENV{CUTLASS_TEST_SETS} @CUTLASS_DEFAULT_ACTIVE_TEST_SETS@) +endif() + +foreach(TEST_SET_REQUESTED IN ITEMS $ENV{CUTLASS_TEST_SETS}) + if (NOT TEST_SET_REQUESTED IN_LIST TEST_SETS_SUPPORTED) + message(STATUS "Skipping tests for @TEST_EXE_PATH@ as ${TEST_SET_REQUESTED} is not in the set of [${TEST_SETS_SUPPORTED}].") + return() + endif() +endforeach() + +set(TEST_EXE_PATH @TEST_EXE_PATH@) +set(TEST_EXE_WORKING_DIRECTORY @TEST_EXE_WORKING_DIRECTORY@) +set(CUTLASS_USE_EXTENDED_ADD_TEST_FORMAT @TEST_USE_EXTENDED_FORMAT@) + +if (DEFINED ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT}) + set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT $ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT}) +else() + set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT @CUTLASS_TEST_EXECUTION_ENVIRONMENT@) +endif() diff --git a/cmake/CTestTestfile.test.configure.cmake b/cmake/CTestTestfile.test.configure.cmake new file mode 100644 index 0000000000..31dba54498 --- /dev/null +++ b/cmake/CTestTestfile.test.configure.cmake @@ -0,0 +1,43 @@ +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +if (CUTLASS_USE_EXTENDED_ADD_TEST_FORMAT) + # The longform/extended format allows generator expressions to be + # expanded property and is useful in contexts where the files need + # to be immediately included into being-processed cmake code. 
+ add_test(NAME @TESTCASE_NAME@ COMMAND ${_CUTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}" @TEST_COMMAND_OPTIONS@) +else() + add_test(@TESTCASE_NAME@ ${_CUTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}" @TEST_COMMAND_OPTIONS@) +endif() + +if (TEST_EXE_WORKING_DIRECTORY) + set_tests_properties(@TESTCASE_NAME@ PROPERTIES WORKING_DIRECTORY "${TEST_EXE_WORKING_DIRECTORY}") +endif() + +set_tests_properties(@TESTCASE_NAME@ PROPERTIES DISABLED @__DISABLE_TESTS@) + diff --git a/cmake/NvidiaCutlassConfig.cmake b/cmake/NvidiaCutlassConfig.cmake.in similarity index 52% rename from cmake/NvidiaCutlassConfig.cmake rename to cmake/NvidiaCutlassConfig.cmake.in index 701ecb4af4..2fe69119a1 100644 --- a/cmake/NvidiaCutlassConfig.cmake +++ b/cmake/NvidiaCutlassConfig.cmake.in @@ -2,6 +2,8 @@ get_filename_component(NvidiaCutlass_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH include(CMakeFindDependencyMacro) -if(NOT TARGET nvidia::cutlass::CUTLASS) - include("${NvidiaCutlass_CMAKE_DIR}/NvidiaCutlassTargets.cmake") +if(TARGET nvidia::cutlass::CUTLASS) + return() endif() + +include("${NvidiaCutlass_CMAKE_DIR}/NvidiaCutlassTargets.cmake") diff --git a/cmake/NvidiaCutlassPackageConfig.cmake b/cmake/NvidiaCutlassPackageConfig.cmake index bb15b1bb70..364fba7a20 100644 --- a/cmake/NvidiaCutlassPackageConfig.cmake +++ b/cmake/NvidiaCutlassPackageConfig.cmake @@ -1,3 +1,31 @@ +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + set(CPACK_PACKAGE_NAME NvidiaCutlass) set(CPACK_PACKAGE_VENDOR NVIDIA) set(CPACK_PACKAGE_CONTACT info@nvidia.com) diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake index 85edc807c9..d220cfadc2 100644 --- a/cmake/googletest.cmake +++ b/cmake/googletest.cmake @@ -1,3 +1,31 @@ +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + include(FetchContent) set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against") @@ -6,10 +34,11 @@ if(GOOGLETEST_DIR) set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override") endif() +set(GTEST_REPOSITORY "https://github.com/google/googletest.git" CACHE STRING "GoogleTest repo to fetch") FetchContent_Declare( googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG 0fe9660 + GIT_REPOSITORY ${GTEST_REPOSITORY} + GIT_TAG v1.14.0 ) FetchContent_GetProperties(googletest) diff --git a/cmake/nop.cu b/cmake/nop.cu index f477557225..be2b15881e 100644 --- a/cmake/nop.cu +++ b/cmake/nop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/cmake/version.h.in b/cmake/version.h.in deleted file mode 100644 index 1b48e1abc2..0000000000 --- a/cmake/version.h.in +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include - -#define CUTLASS_MAJOR @CUTLASS_VERSION_MAJOR@ -#define CUTLASS_MINOR @CUTLASS_VERSION_MINOR@ -#define CUTLASS_PATCH @CUTLASS_VERSION_PATCH@ -#define CUTLASS_BUILD @CUTLASS_VERSION_BUILD@ -#define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH) - -namespace cutlass { - - inline uint32_t getVersion() { - return CUTLASS_VERSION; - } - inline uint32_t getVersionMajor() { - return CUTLASS_MAJOR; - } - inline uint32_t getVersionMinor() { - return CUTLASS_MINOR; - } - inline uint32_t getVersionPatch() { - return CUTLASS_PATCH; - } - inline uint32_t getVersionBuild() { - return CUTLASS_BUILD + 0; - } - inline std::string getVersionString() { - std::string version = "@CUTLASS_VERSION@"; - if (getVersionBuild()) { - version += "." + std::to_string(getVersionBuild()); - } - return version; - } - inline std::string getGitRevision() { - return "@CUTLASS_REVISION@"; - } - -} // namespace cutlass diff --git a/cmake/version_extended.h.in b/cmake/version_extended.h.in new file mode 100644 index 0000000000..3613063022 --- /dev/null +++ b/cmake/version_extended.h.in @@ -0,0 +1,34 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#define CUTLASS_BUILD @CUTLASS_VERSION_BUILD@ +#define CUTLASS_REVISION "@CUTLASS_REVISION@" diff --git a/cuBLAS.cmake b/cuBLAS.cmake index a66274eb78..383871fdff 100644 --- a/cuBLAS.cmake +++ b/cuBLAS.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -40,7 +40,7 @@ elseif(NOT TARGET cublas) find_path( _CUBLAS_INCLUDE_DIR - NAMES cublas.h + NAMES cublas_v2.h HINTS ${CUBLAS_INCLUDE_PATH} ENV CUBLAS_INCLUDE_PATH diff --git a/cuDNN.cmake b/cuDNN.cmake index 4f89f43dd2..0b37ff7c30 100644 --- a/cuDNN.cmake +++ b/cuDNN.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/docs/annotated.html b/docs/annotated.html index d587a26d9a..233691c235 100644 --- a/docs/annotated.html +++ b/docs/annotated.html @@ -280,15 +280,15 @@  CDefaultGemmConfiguration< arch::OpClassWmmaTensorOp, ArchTag, ElementA, ElementB, ElementC, ElementAccumulator >  CGemm  CArgumentsArgument structure - CGemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >Parital specialization for column-major output exchanges problem size and operand + CGemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >Partial specialization for column-major output exchanges problem size and operand  CArgumentsArgument structure  CGemmBatched  CArgumentsArgument structure - CGemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ >Parital specialization for column-major output exchanges problem size and operand + CGemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ >Partial specialization for column-major output exchanges problem size and operand  CArgumentsArgument structure  CGemmComplex  CArgumentsArgument structure - CGemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >Parital specialization for column-major output exchanges problem size and operand + CGemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, 
ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >Partial specialization for column-major output exchanges problem size and operand  CArgumentsArgument structure  CGemmSplitKParallel  CArgumentsArgument structure @@ -594,7 +594,7 @@  CGemm  CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd >Partial specialization for multiply-add  CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate >Partial specialization for multiply-add-saturate - CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc >Parital specialization for XOR-popc + CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc >Partial specialization for XOR-popc  CTensorDiagonalForEachLaunches a kernel calling a functor for each element along a tensor's diagonal  CTensorForEachLaunches a kernel calling a functor for each element in a tensor's index space  Nhost @@ -620,7 +620,7 @@  CGemm  CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd >Partial specialization for multiply-add  CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate >Partial specialization for multiply-add-saturate - CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc >Parital specialization for XOR-popc + CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc >Partial specialization for XOR-popc  Nthread  CMatrixPer-thread matrix object storing a packed matrix  Ntransform @@ -718,7 +718,7 @@  CDetailInternal details made public to facilitate introspection Iterations along each dimension (concept: PitchLinearShape)  CTransposePitchLinearThreadMap  CDetailInternal details made public to facilitate introspection Iterations along each dimension (concept: PitchLinearShape) - CTransposePitchLinearThreadMap2DThreadTileThread Mapping a 2D threadtiled mapping as a tranposed Pitchlinear2DThreadTile mapping + CTransposePitchLinearThreadMap2DThreadTileThread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping  CTransposePitchLinearThreadMapSimt  CAlignedArrayAligned array type  CAlignedBufferModifies semantics of cutlass::Array<> to provide guaranteed alignment diff --git a/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html b/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html index d53d8d67dc..6800f4fe70 100644 --- a/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html +++ b/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html @@ -108,7 +108,7 @@
-Parital specialization for column-major output exchanges problem size and operand.
+Partial specialization for column-major output exchanges problem size and operand.

#include <gemm_batched.h>

diff --git a/docs/classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html b/docs/classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html index 8468585615..d09783439c 100644 --- a/docs/classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html +++ b/docs/classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html @@ -108,7 +108,7 @@
-Parital specialization for column-major output exchanges problem size and operand.
+Partial specialization for column-major output exchanges problem size and operand.

#include <gemm_complex.h>

diff --git a/docs/classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html b/docs/classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html index f34be6e5ee..323ecfc2b0 100644 --- a/docs/classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html +++ b/docs/classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html @@ -108,7 +108,7 @@
-Parital specialization for column-major output exchanges problem size and operand.
+Partial specialization for column-major output exchanges problem size and operand.

#include <gemm.h>
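
The wording "exchanges problem size and operand" above reflects a standard identity rather than anything CUTLASS-specific: the buffer of a column-major matrix is the row-major buffer of its transpose, and C = A * B implies C^T = B^T * A^T, so a kernel that only produces row-major output can serve a column-major-output problem by swapping the A/B operands and the M/N extents. The plain C++ reference sketch below (hypothetical helper names, not CUTLASS code) checks that equivalence numerically.

```c++
// Hedged sketch of the operand-exchange idea behind these specializations.
#include <cassert>
#include <vector>

// Column-major reference: C(m,n) = sum_k A(m,k)*B(k,n), with X(i,j) = X[i + j*rows].
void gemm_colmajor(int M, int N, int K, const float* A, const float* B, float* C) {
  for (int n = 0; n < N; ++n)
    for (int m = 0; m < M; ++m) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += A[m + k * M] * B[k + n * K];
      C[m + n * M] = acc;
    }
}

// Row-major reference: C(m,n) = sum_k A(m,k)*B(k,n), with X(i,j) = X[i*cols + j].
void gemm_rowmajor(int M, int N, int K, const float* A, const float* B, float* C) {
  for (int m = 0; m < M; ++m)
    for (int n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += A[m * K + k] * B[k * N + n];
      C[m * N + n] = acc;
    }
}

int main() {
  int M = 2, N = 3, K = 4;
  std::vector<float> A(M * K), B(K * N), C_col(M * N), C_exch(M * N);
  for (int i = 0; i < M * K; ++i) A[i] = float(i + 1);
  for (int i = 0; i < K * N; ++i) B[i] = float(2 * i) - 3.f;

  // Column-major product computed directly.
  gemm_colmajor(M, N, K, A.data(), B.data(), C_col.data());

  // Same product via operand exchange: a row-major-output GEMM with (M,N) swapped
  // and (A,B) swapped writes a buffer that is exactly the column-major C.
  gemm_rowmajor(N, M, K, B.data(), A.data(), C_exch.data());

  for (int i = 0; i < M * N; ++i) assert(C_col[i] == C_exch[i]);
  return 0;
}
```
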

diff --git a/docs/command__line_8h_source.html b/docs/command__line_8h_source.html index f98c9f1eaf..325a303405 100644 --- a/docs/command__line_8h_source.html +++ b/docs/command__line_8h_source.html @@ -98,7 +98,7 @@
command_line.h
-Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are not permitted.
6  *
7  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
8  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
9  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
10  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
11  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
12  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
13  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
14  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
15  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
16  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
17  *
18  ******************************************************************************/
19 
20 #pragma once
21 
27 #include <iostream>
28 #include <limits>
29 #include <sstream>
30 #include <string>
31 #include <vector>
32 
33 #include <cuda_runtime.h>
34 
35 namespace cutlass {
36 
37 /******************************************************************************
38  * command_line
39  ******************************************************************************/
40 
44 struct CommandLine {
45  std::vector<std::string> keys;
46  std::vector<std::string> values;
47  std::vector<std::string> args;
48 
52  CommandLine(int argc, const char** argv) {
53  using namespace std;
54 
55  for (int i = 1; i < argc; i++) {
56  string arg = argv[i];
57 
58  if ((arg[0] != '-') || (arg[1] != '-')) {
59  args.push_back(arg);
60  continue;
61  }
62 
63  string::size_type pos;
64  string key, val;
65  if ((pos = arg.find('=')) == string::npos) {
66  key = string(arg, 2, arg.length() - 2);
67  val = "";
68  } else {
69  key = string(arg, 2, pos - 2);
70  val = string(arg, pos + 1, arg.length() - 1);
71  }
72 
73  keys.push_back(key);
74  values.push_back(val);
75  }
76  }
77 
81  bool check_cmd_line_flag(const char* arg_name) const {
82  using namespace std;
83 
84  for (int i = 0; i < int(keys.size()); ++i) {
85  if (keys[i] == string(arg_name)) return true;
86  }
87  return false;
88  }
89 
93  template <typename value_t>
94  int num_naked_args() const {
95  return args.size();
96  }
97 
101  template <typename value_t>
102  void get_cmd_line_argument(int index, value_t& val) const {
103  using namespace std;
104  if (index < args.size()) {
105  istringstream str_stream(args[index]);
106  str_stream >> val;
107  }
108  }
109 
113  void get_cmd_line_argument(const char* arg_name, bool& val, bool _default = true) const {
114  val = _default;
115  if (check_cmd_line_flag(arg_name)) {
116  std::string value;
117  get_cmd_line_argument(arg_name, value);
118 
119  val = !(value == "0" || value == "false");
120  }
121  }
122 
126  template <typename value_t>
127  void get_cmd_line_argument(const char* arg_name,
128  value_t& val,
129  value_t const& _default = value_t()) const {
130  using namespace std;
131 
132  val = _default;
133 
134  for (int i = 0; i < int(keys.size()); ++i) {
135  if (keys[i] == string(arg_name)) {
136  istringstream str_stream(values[i]);
137  str_stream >> val;
138  }
139  }
140  }
141 
145  template <typename value_t>
146  void get_cmd_line_arguments(const char* arg_name,
147  std::vector<value_t>& vals,
148  char sep = ',') const {
149  using namespace std;
150 
151  if (check_cmd_line_flag(arg_name)) {
152  // Clear any default values
153  vals.clear();
154 
155  // Recover from multi-value string
156  for (int i = 0; i < keys.size(); ++i) {
157  if (keys[i] == string(arg_name)) {
158  string val_string(values[i]);
159  seperate_string(val_string, vals, sep);
160  }
161  }
162  }
163  }
164 
169  void get_cmd_line_argument_pairs(const char* arg_name,
170  std::vector<std::pair<std::string, std::string> >& tokens,
171  char delim = ',',
172  char sep = ':') const {
173  if (check_cmd_line_flag(arg_name)) {
174  std::string value;
175  get_cmd_line_argument(arg_name, value);
176 
177  tokenize(tokens, value, delim, sep);
178  }
179  }
180 
185  void get_cmd_line_argument_ranges(const char* arg_name,
186  std::vector<std::vector<std::string> >& vals,
187  char delim = ',',
188  char sep = ':') const {
189  std::vector<std::string> ranges;
190  get_cmd_line_arguments(arg_name, ranges, delim);
191 
192  for (std::vector<std::string>::const_iterator range = ranges.begin();
193  range != ranges.end(); ++range) {
194 
195  std::vector<std::string> range_vals;
196  seperate_string(*range, range_vals, sep);
197  vals.push_back(range_vals);
198  }
199  }
200 
204  int parsed_argc() const { return (int)keys.size(); }
205 
206  //-------------------------------------------------------------------------
207  // Utility functions
208  //-------------------------------------------------------------------------
209 
211  static void tokenize(std::vector<std::pair<std::string, std::string> >& tokens,
212  std::string const& str,
213  char delim = ',',
214  char sep = ':') {
215  // Home-built to avoid Boost dependency
216  size_t s_idx = 0;
217  size_t d_idx = std::string::npos;
218  while (s_idx < str.size()) {
219  d_idx = str.find_first_of(delim, s_idx);
220 
221  size_t end_idx = (d_idx != std::string::npos ? d_idx : str.size());
222  size_t sep_idx = str.find_first_of(sep, s_idx);
223  size_t offset = 1;
224  if (sep_idx == std::string::npos || sep_idx >= end_idx) {
225  sep_idx = end_idx;
226  offset = 0;
227  }
228 
229  std::pair<std::string, std::string> item(
230  str.substr(s_idx, sep_idx - s_idx),
231  str.substr(sep_idx + offset, end_idx - sep_idx - offset));
232 
233  tokens.push_back(item);
234  s_idx = end_idx + 1;
235  }
236  }
237 
239  static void tokenize(std::vector<std::string>& tokens,
240  std::string const& str,
241  char delim = ',',
242  char sep = ':') {
243  typedef std::vector<std::pair<std::string, std::string> > TokenVector;
244  typedef TokenVector::const_iterator token_iterator;
245 
246  std::vector<std::pair<std::string, std::string> > token_pairs;
247  tokenize(token_pairs, str, delim, sep);
248  for (token_iterator tok = token_pairs.begin(); tok != token_pairs.end(); ++tok) {
249  tokens.push_back(tok->first);
250  }
251  }
252 
253  template <typename value_t>
254  static void seperate_string(std::string const& str,
255  std::vector<value_t>& vals,
256  char sep = ',') {
257  std::istringstream str_stream(str);
258  std::string::size_type old_pos = 0;
259  std::string::size_type new_pos = 0;
260 
261  // Iterate <sep>-delimited values
262  value_t val;
263  while ((new_pos = str.find(sep, old_pos)) != std::string::npos) {
264  if (new_pos != old_pos) {
265  str_stream.width(new_pos - old_pos);
266  str_stream >> val;
267  vals.push_back(val);
268  }
269 
270  // skip over delimiter
271  str_stream.ignore(1);
272  old_pos = new_pos + 1;
273  }
274 
275  // Read last value
276  str_stream >> val;
277  vals.push_back(val);
278  }
279 };
280 
281 } // namespace cutlass
Definition: aligned_buffer.h:35
+Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are not permitted.
6  *
7  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
8  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
9  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
10  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
11  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
12  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
13  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
14  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
15  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
16  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
17  *
18  ******************************************************************************/
19 
20 #pragma once
21 
27 #include <iostream>
28 #include <limits>
29 #include <sstream>
30 #include <string>
31 #include <vector>
32 
33 #include <cuda_runtime.h>
34 
35 namespace cutlass {
36 
37 /******************************************************************************
38  * command_line
39  ******************************************************************************/
40 
44 struct CommandLine {
45  std::vector<std::string> keys;
46  std::vector<std::string> values;
47  std::vector<std::string> args;
48 
52  CommandLine(int argc, const char** argv) {
53  using namespace std;
54 
55  for (int i = 1; i < argc; i++) {
56  string arg = argv[i];
57 
58  if ((arg[0] != '-') || (arg[1] != '-')) {
59  args.push_back(arg);
60  continue;
61  }
62 
63  string::size_type pos;
64  string key, val;
65  if ((pos = arg.find('=')) == string::npos) {
66  key = string(arg, 2, arg.length() - 2);
67  val = "";
68  } else {
69  key = string(arg, 2, pos - 2);
70  val = string(arg, pos + 1, arg.length() - 1);
71  }
72 
73  keys.push_back(key);
74  values.push_back(val);
75  }
76  }
77 
81  bool check_cmd_line_flag(const char* arg_name) const {
82  using namespace std;
83 
84  for (int i = 0; i < int(keys.size()); ++i) {
85  if (keys[i] == string(arg_name)) return true;
86  }
87  return false;
88  }
89 
93  template <typename value_t>
94  int num_naked_args() const {
95  return args.size();
96  }
97 
101  template <typename value_t>
102  void get_cmd_line_argument(int index, value_t& val) const {
103  using namespace std;
104  if (index < args.size()) {
105  istringstream str_stream(args[index]);
106  str_stream >> val;
107  }
108  }
109 
113  void get_cmd_line_argument(const char* arg_name, bool& val, bool _default = true) const {
114  val = _default;
115  if (check_cmd_line_flag(arg_name)) {
116  std::string value;
117  get_cmd_line_argument(arg_name, value);
118 
119  val = !(value == "0" || value == "false");
120  }
121  }
122 
126  template <typename value_t>
127  void get_cmd_line_argument(const char* arg_name,
128  value_t& val,
129  value_t const& _default = value_t()) const {
130  using namespace std;
131 
132  val = _default;
133 
134  for (int i = 0; i < int(keys.size()); ++i) {
135  if (keys[i] == string(arg_name)) {
136  istringstream str_stream(values[i]);
137  str_stream >> val;
138  }
139  }
140  }
141 
145  template <typename value_t>
146  void get_cmd_line_arguments(const char* arg_name,
147  std::vector<value_t>& vals,
148  char sep = ',') const {
149  using namespace std;
150 
151  if (check_cmd_line_flag(arg_name)) {
152  // Clear any default values
153  vals.clear();
154 
155  // Recover from multi-value string
156  for (int i = 0; i < keys.size(); ++i) {
157  if (keys[i] == string(arg_name)) {
158  string val_string(values[i]);
159  separate_string(val_string, vals, sep);
160  }
161  }
162  }
163  }
164 
169  void get_cmd_line_argument_pairs(const char* arg_name,
170  std::vector<std::pair<std::string, std::string> >& tokens,
171  char delim = ',',
172  char sep = ':') const {
173  if (check_cmd_line_flag(arg_name)) {
174  std::string value;
175  get_cmd_line_argument(arg_name, value);
176 
177  tokenize(tokens, value, delim, sep);
178  }
179  }
180 
185  void get_cmd_line_argument_ranges(const char* arg_name,
186  std::vector<std::vector<std::string> >& vals,
187  char delim = ',',
188  char sep = ':') const {
189  std::vector<std::string> ranges;
190  get_cmd_line_arguments(arg_name, ranges, delim);
191 
192  for (std::vector<std::string>::const_iterator range = ranges.begin();
193  range != ranges.end(); ++range) {
194 
195  std::vector<std::string> range_vals;
196  separate_string(*range, range_vals, sep);
197  vals.push_back(range_vals);
198  }
199  }
200 
204  int parsed_argc() const { return (int)keys.size(); }
205 
206  //-------------------------------------------------------------------------
207  // Utility functions
208  //-------------------------------------------------------------------------
209 
211  static void tokenize(std::vector<std::pair<std::string, std::string> >& tokens,
212  std::string const& str,
213  char delim = ',',
214  char sep = ':') {
215  // Home-built to avoid Boost dependency
216  size_t s_idx = 0;
217  size_t d_idx = std::string::npos;
218  while (s_idx < str.size()) {
219  d_idx = str.find_first_of(delim, s_idx);
220 
221  size_t end_idx = (d_idx != std::string::npos ? d_idx : str.size());
222  size_t sep_idx = str.find_first_of(sep, s_idx);
223  size_t offset = 1;
224  if (sep_idx == std::string::npos || sep_idx >= end_idx) {
225  sep_idx = end_idx;
226  offset = 0;
227  }
228 
229  std::pair<std::string, std::string> item(
230  str.substr(s_idx, sep_idx - s_idx),
231  str.substr(sep_idx + offset, end_idx - sep_idx - offset));
232 
233  tokens.push_back(item);
234  s_idx = end_idx + 1;
235  }
236  }
237 
239  static void tokenize(std::vector<std::string>& tokens,
240  std::string const& str,
241  char delim = ',',
242  char sep = ':') {
243  typedef std::vector<std::pair<std::string, std::string> > TokenVector;
244  typedef TokenVector::const_iterator token_iterator;
245 
246  std::vector<std::pair<std::string, std::string> > token_pairs;
247  tokenize(token_pairs, str, delim, sep);
248  for (token_iterator tok = token_pairs.begin(); tok != token_pairs.end(); ++tok) {
249  tokens.push_back(tok->first);
250  }
251  }
252 
253  template <typename value_t>
254  static void separate_string(std::string const& str,
255  std::vector<value_t>& vals,
256  char sep = ',') {
257  std::istringstream str_stream(str);
258  std::string::size_type old_pos = 0;
259  std::string::size_type new_pos = 0;
260 
261  // Iterate <sep>-delimited values
262  value_t val;
263  while ((new_pos = str.find(sep, old_pos)) != std::string::npos) {
264  if (new_pos != old_pos) {
265  str_stream.width(new_pos - old_pos);
266  str_stream >> val;
267  vals.push_back(val);
268  }
269 
270  // skip over delimiter
271  str_stream.ignore(1);
272  old_pos = new_pos + 1;
273  }
274 
275  // Read last value
276  str_stream >> val;
277  vals.push_back(val);
278  }
279 };
280 
281 } // namespace cutlass
Definition: aligned_buffer.h:35
void get_cmd_line_argument(const char *arg_name, value_t &val, value_t const &_default=value_t()) const
Definition: command_line.h:127
void get_cmd_line_argument_pairs(const char *arg_name, std::vector< std::pair< std::string, std::string > > &tokens, char delim= ',', char sep= ':') const
Definition: command_line.h:169
STL namespace.
@@ -116,7 +116,7 @@
CommandLine(int argc, const char **argv)
Definition: command_line.h:52
std::vector< std::string > args
Definition: command_line.h:47
Definition: command_line.h:44
-
static void seperate_string(std::string const &str, std::vector< value_t > &vals, char sep= ',')
Definition: command_line.h:254
+
static void separate_string(std::string const &str, std::vector< value_t > &vals, char sep= ',')
Definition: command_line.h:254
int parsed_argc() const
Definition: command_line.h:204
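For readers following the `seperate_string` → `separate_string` rename above, here is a minimal usage sketch of the `cutlass::CommandLine` helper whose source is shown in this listing. The option names (`--m`, `--verbose`, `--tiles`) are invented for illustration, and the include path assumes CUTLASS's `tools/util/include` directory is on the compiler's include path.

```cpp
#include <iostream>
#include <string>
#include <vector>

#include "cutlass/util/command_line.h"  // the command_line.h shown in this diff

int main(int argc, const char **argv) {
  cutlass::CommandLine cmd(argc, argv);

  // Scalar option with a default value: --m=1024
  int m = 0;
  cmd.get_cmd_line_argument("m", m, 1024);

  // Boolean flag: --verbose or --verbose=false; the default is used when absent
  bool verbose = false;
  cmd.get_cmd_line_argument("verbose", verbose, false);

  // Comma-separated list: --tiles=128,256,64 (split internally by separate_string)
  std::vector<int> tiles;
  cmd.get_cmd_line_arguments("tiles", tiles);

  std::cout << "m=" << m << ", verbose=" << verbose
            << ", tiles parsed=" << tiles.size() << std::endl;
  return 0;
}
```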
diff --git a/docs/default__mma__core__simt_8h_source.html b/docs/default__mma__core__simt_8h_source.html index 6897c764f0..6e068e5a87 100644 --- a/docs/default__mma__core__simt_8h_source.html +++ b/docs/default__mma__core__simt_8h_source.html @@ -144,7 +144,7 @@
Defines the size of an element in bits.
Definition: numeric_types.h:42
-
Thread Mapping a 2D threadtiled mapping as a tranposed Pitchlinear2DThreadTile mapping.
Definition: pitch_linear_thread_map.h:713
+
Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping.
Definition: pitch_linear_thread_map.h:713
Defines basic properties needed by CTA-level GEMMs assuming expectations about data layout of the glo...
diff --git a/docs/device_2gemm__batched_8h.html b/docs/device_2gemm__batched_8h.html index e648c67eea..3ef58bf18e 100644 --- a/docs/device_2gemm__batched_8h.html +++ b/docs/device_2gemm__batched_8h.html @@ -130,7 +130,7 @@  Argument structure. More...
  class  cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ > - Parital specialization for column-major output exchanges problem size and operand. More...
+ Partial specialization for column-major output exchanges problem size and operand. More...
  struct  cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ >::Arguments  Argument structure. More...
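A brief reminder of what "exchanges problem size and operand" means in these Doxygen briefs (the identity below is standard linear algebra, not text taken from the diff): a column-major output can be produced by a kernel that writes row-major output, because

$$
D^{T} = (\alpha A B + \beta C)^{T} = \alpha\, B^{T} A^{T} + \beta\, C^{T},
$$

so the partial specialization swaps the A and B operands, transposes their layouts, and exchanges M and N in the problem size before delegating to the row-major path.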
diff --git a/docs/device_2kernel_2tensor__foreach_8h_source.html b/docs/device_2kernel_2tensor__foreach_8h_source.html index a4839c25f7..21c8fb591f 100644 --- a/docs/device_2kernel_2tensor__foreach_8h_source.html +++ b/docs/device_2kernel_2tensor__foreach_8h_source.html @@ -100,7 +100,7 @@
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 
26 #pragma once
27 
28 #include "cutlass/cutlass.h"
29 #include "cutlass/coord.h"
30 
31 namespace cutlass {
32 namespace reference {
33 namespace device {
34 namespace kernel {
35 
37 
39 namespace detail {
40 
42 template <typename Func, int Rank, int RankRemaining>
44 
46  __inline__ __device__
47  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {
48 
49  int64_t product = 1;
50 
52  for (int i = Rank - RankRemaining; i < Rank; ++i) {
53  product *= size[i];
54  }
55 
56  coord[Rank - 1 - RankRemaining] = index / product;
57  int64_t remaining = index % product;
58 
59  TensorForEachHelper<Func, Rank, RankRemaining-1>(func, size, coord, remaining);
60  }
61 };
62 
64 template <typename Func, int Rank>
65 struct TensorForEachHelper<Func, Rank, 0> {
66 
68  __inline__ __device__
69  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {
70 
71  coord[Rank - 1] = index;
72 
73  if (coord < size) {
74  func(coord);
75  }
76  }
77 };
78 
79 } // namespace detail
80 
82 
84 template <typename Func, int Rank, typename Params>
85 __global__ void TensorForEach(Coord<Rank> size, Params params = Params()) {
86 
87  Func func(params);
88 
89  int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
90  int64_t max_index = 1;
91 
93  for (int i = 0; i < Rank; ++i) {
94  max_index *= size[i];
95  }
96 
98  while (index < max_index) {
99  Coord<Rank> coord;
100 
101  detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, size, coord, index);
102  index += blockDim.x * gridDim.x;
103  }
104 }
105 
107 
109 template <typename Func, int Rank, typename Params>
110 __global__ void TensorDiagonalForEach(Coord<Rank> size, Params params, int start, int end) {
111 
112  Func func(params);
113 
114  int64_t index = threadIdx.x + blockIdx.x * blockDim.x + start;
115 
116  if (index < end) {
117  Coord<Rank> coord;
118 
120  for (int i = 0; i < Rank; ++i) {
121  coord[i] = index;
122  }
123 
124  func(coord);
125  }
126 }
127 
129 
130 template <typename Element, typename Func>
131 __global__ void BlockForEach(
132  Element *ptr,
133  size_t capacity,
134  typename Func::Params params) {
135 
136  Func func(params);
137 
138  size_t index = threadIdx.x + blockIdx.x * blockDim.x;
139 
140  for (; index < capacity; index += blockDim.x * gridDim.x) {
141  ptr[index] = func();
142  }
143 }
144 
146 
147 } // namespace kernel
148 } // namespace device
149 } // namespace reference
150 } // namespace cutlass
151 
Definition: aligned_buffer.h:35
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
-
__inline__ __device__ TensorForEachHelper(Func &func, Coord< Rank > const &size, Coord< Rank > &coord, int64_t index)
Constructor for fastest chaning rank.
Definition: device/kernel/tensor_foreach.h:69
+
__inline__ __device__ TensorForEachHelper(Func &func, Coord< Rank > const &size, Coord< Rank > &coord, int64_t index)
Constructor for fastest changing rank.
Definition: device/kernel/tensor_foreach.h:69
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
__global__ void BlockForEach(Element *ptr, size_t capacity, typename Func::Params params)
Definition: device/kernel/tensor_foreach.h:131
#define CUTLASS_PRAGMA_NO_UNROLL
Definition: cutlass.h:111
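The `TensorForEachHelper` recursion above converts a linear thread index into a Rank-dimensional coordinate in row-major order. Below is a host-side C++ sketch of the same mapping, written iteratively for clarity; the helper function is illustrative only and is not part of CUTLASS.

```cpp
#include <array>
#include <cstdint>
#include <iostream>

// Decompose a linear index into a Rank-dimensional coordinate against `size`,
// mirroring what TensorForEachHelper does recursively on the device:
//   coord[r] = index / prod(size[r+1 .. Rank)), then recurse on the remainder.
template <int Rank>
std::array<int64_t, Rank> index_to_coord(int64_t index,
                                         std::array<int64_t, Rank> const &size) {
  std::array<int64_t, Rank> coord{};
  for (int r = 0; r < Rank; ++r) {
    int64_t product = 1;
    for (int i = r + 1; i < Rank; ++i) {
      product *= size[i];  // elements covered by one step along rank r
    }
    coord[r] = index / product;
    index %= product;      // the `remaining` value passed to the next level
  }
  return coord;
}

int main() {
  std::array<int64_t, 3> size = {2, 3, 4};
  auto c = index_to_coord<3>(17, size);  // 17 = 1*12 + 1*4 + 1
  std::cout << c[0] << "," << c[1] << "," << c[2] << std::endl;  // prints 1,1,1
  return 0;
}
```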
diff --git a/docs/device_2tensor__fill_8h.html b/docs/device_2tensor__fill_8h.html index c2af8c44d3..5a99459cfc 100644 --- a/docs/device_2tensor__fill_8h.html +++ b/docs/device_2tensor__fill_8h.html @@ -237,7 +237,7 @@   template<typename Element , typename Layout > void cutlass::reference::device::TensorFillIdentity (TensorView< Element, Layout > view) - Fills a tensor's digonal with 1 and 0 everywhere else. More...
+ Fills a tensor's diagonal with 1 and 0 everywhere else. More...
  template<typename Element , typename Layout > void cutlass::reference::device::TensorUpdateDiagonal (TensorView< Element, Layout > view, Element diag=Element(1)) diff --git a/docs/device_2tensor__fill_8h_source.html b/docs/device_2tensor__fill_8h_source.html index dd5debdaa8..908ccba847 100644 --- a/docs/device_2tensor__fill_8h_source.html +++ b/docs/device_2tensor__fill_8h_source.html @@ -125,7 +125,7 @@
Parameters structure.
Definition: device/tensor_fill.h:99
Kind kind
Active variant kind.
Definition: distribution.h:64
Params(TensorView view_=TensorView(), typename RandomFunc::Params random_=RandomFunc::Params())
Construction of Gaussian RNG functor.
Definition: device/tensor_fill.h:422
-
void TensorFillIdentity(TensorView< Element, Layout > view)
Fills a tensor&#39;s digonal with 1 and 0 everywhere else.
Definition: device/tensor_fill.h:630
+
void TensorFillIdentity(TensorView< Element, Layout > view)
Fills a tensor&#39;s diagonal with 1 and 0 everywhere else.
Definition: device/tensor_fill.h:630
CUTLASS_HOST_DEVICE TensorCoord const & extent() const
Returns the extent of the view (the size along each logical dimension).
Definition: tensor_view.h:167
Computes a random Gaussian distribution.
Definition: device/tensor_fill.h:645
int int_scale
Definition: device/tensor_fill.h:315
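The `TensorFillIdentity` change above is purely a comment fix, but for orientation, this is roughly how the device reference function is used together with `cutlass::HostTensor`. This is a sketch only; it assumes the CUTLASS `tools/util` headers are available and the translation unit is compiled with nvcc.

```cpp
#include "cutlass/layout/matrix.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/device/tensor_fill.h"

int main() {
  // An 8x8 row-major tensor allocated in both host and device memory.
  cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({8, 8});

  // Fill the device-side copy so element (i, j) is 1 when i == j and 0 otherwise.
  cutlass::reference::device::TensorFillIdentity(tensor.device_view());

  // Copy back to the host for inspection.
  tensor.sync_host();
  return 0;
}
```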
diff --git a/docs/device_2tensor__foreach_8h_source.html b/docs/device_2tensor__foreach_8h_source.html index 90c5402ef2..0380fa93f3 100644 --- a/docs/device_2tensor__foreach_8h_source.html +++ b/docs/device_2tensor__foreach_8h_source.html @@ -98,7 +98,7 @@
device/tensor_foreach.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 #pragma once
26 
27 #include <stdexcept>
28 #include "cutlass/cutlass.h"
30 
31 namespace cutlass {
32 namespace reference {
33 namespace device {
34 
36 
38 template <typename Func, int Rank, typename Params>
39 struct TensorForEach {
40 
42  TensorForEach(Coord<Rank> size, Params params = Params(), int grid_size = 0, int block_size = 0) {
43 
44  if (!grid_size || !block_size) {
45 
46  // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
47  cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
48  &grid_size,
49  &block_size,
50  reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));
51 
52  if (result != cudaSuccess) {
53  throw std::runtime_error("Failed to query occupancy.");
54  }
55 
56  // Limit block size. This has the effect of increasing the number of items processed by a
57  // single thread and reduces the impact of initialization overhead.
58  block_size = (block_size < 128 ? block_size : 128);
59  }
60 
61  dim3 grid(grid_size, 1, 1);
62  dim3 block(block_size, 1, 1);
63 
64  kernel::TensorForEach<Func, Rank, Params><<< grid, block >>>(size, params);
65  }
66 };
67 
69 
71 template <typename Func, int Rank, typename Params>
73 
75  TensorDiagonalForEach(Coord<Rank> size, Params params = Params(), int start = 0, int end = -1, int block_size = 128) {
76 
77  if (end < 0) {
78  end = size.min();
79  }
80 
81  dim3 block(block_size, 1, 1);
82  dim3 grid((end - start + block_size - 1) / block_size, 1, 1);
83 
84  kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block >>>(size, params, start, end);
85  }
86 };
87 
88 
90 
91 template <typename Element, typename Func>
92 struct BlockForEach {
93 
96  Element *ptr,
97  size_t capacity,
98  typename Func::Params params = typename Func::Params(),
99  int grid_size = 0,
100  int block_size = 0) {
101 
102  if (!grid_size || !block_size) {
103 
104  // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
105  cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
106  &grid_size,
107  &block_size,
108  reinterpret_cast<void const *>(kernel::BlockForEach<Element, Func>));
109 
110  if (result != cudaSuccess) {
111  throw std::runtime_error("Failed to query occupancy.");
112  }
113 
114  // Limit block size. This has the effect of increasing the number of items processed by a
115  // single thread and reduces the impact of initialization overhead.
116  block_size = (block_size < 128 ? block_size : 128);
117  }
118 
119  dim3 grid(grid_size, 1, 1);
120  dim3 block(block_size, 1, 1);
121 
122  kernel::BlockForEach<Element, Func><<< grid, block >>>(ptr, capacity, params);
123  }
124 };
125 
127 
128 } // namespace device
129 } // namespace reference
130 } // namesace cutlass
Definition: aligned_buffer.h:35
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 #pragma once
26 
27 #include <stdexcept>
28 #include "cutlass/cutlass.h"
30 
31 namespace cutlass {
32 namespace reference {
33 namespace device {
34 
36 
38 template <typename Func, int Rank, typename Params>
39 struct TensorForEach {
40 
42  TensorForEach(Coord<Rank> size, Params params = Params(), int grid_size = 0, int block_size = 0) {
43 
44  if (!grid_size || !block_size) {
45 
46  // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
47  cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
48  &grid_size,
49  &block_size,
50  reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));
51 
52  if (result != cudaSuccess) {
53  throw std::runtime_error("Failed to query occupancy.");
54  }
55 
56  // Limit block size. This has the effect of increasing the number of items processed by a
57  // single thread and reduces the impact of initialization overhead.
58  block_size = (block_size < 128 ? block_size : 128);
59  }
60 
61  dim3 grid(grid_size, 1, 1);
62  dim3 block(block_size, 1, 1);
63 
64  kernel::TensorForEach<Func, Rank, Params><<< grid, block >>>(size, params);
65  }
66 };
67 
69 
71 template <typename Func, int Rank, typename Params>
73 
75  TensorDiagonalForEach(Coord<Rank> size, Params params = Params(), int start = 0, int end = -1, int block_size = 128) {
76 
77  if (end < 0) {
78  end = size.min();
79  }
80 
81  dim3 block(block_size, 1, 1);
82  dim3 grid((end - start + block_size - 1) / block_size, 1, 1);
83 
84  kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block >>>(size, params, start, end);
85  }
86 };
87 
88 
90 
91 template <typename Element, typename Func>
92 struct BlockForEach {
93 
96  Element *ptr,
97  size_t capacity,
98  typename Func::Params params = typename Func::Params(),
99  int grid_size = 0,
100  int block_size = 0) {
101 
102  if (!grid_size || !block_size) {
103 
104  // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
105  cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
106  &grid_size,
107  &block_size,
108  reinterpret_cast<void const *>(kernel::BlockForEach<Element, Func>));
109 
110  if (result != cudaSuccess) {
111  throw std::runtime_error("Failed to query occupancy.");
112  }
113 
114  // Limit block size. This has the effect of increasing the number of items processed by a
115  // single thread and reduces the impact of initialization overhead.
116  block_size = (block_size < 128 ? block_size : 128);
117  }
118 
119  dim3 grid(grid_size, 1, 1);
120  dim3 block(block_size, 1, 1);
121 
122  kernel::BlockForEach<Element, Func><<< grid, block >>>(ptr, capacity, params);
123  }
124 };
125 
127 
128 } // namespace device
129 } // namespace reference
130 } // namespace cutlass
Definition: aligned_buffer.h:35
TensorDiagonalForEach(Coord< Rank > size, Params params=Params(), int start=0, int end=-1, int block_size=128)
Constructor performs the operation.
Definition: device/tensor_foreach.h:75
TensorForEach(Coord< Rank > size, Params params=Params(), int grid_size=0, int block_size=0)
Constructor performs the operation.
Definition: device/tensor_foreach.h:42
Launches a kernel calling a functor for each element along a tensor&#39;s diagonal.
Definition: device/tensor_foreach.h:72
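When no grid or block size is supplied, the `TensorForEach`, `TensorDiagonalForEach`, and `BlockForEach` wrappers above derive one from the CUDA occupancy API and then cap the block size at 128 threads. A self-contained CUDA sketch of that launch-configuration pattern follows; the `fill_kernel` used here is invented for illustration and is not part of CUTLASS.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Simple grid-stride kernel standing in for kernel::TensorForEach / BlockForEach.
__global__ void fill_kernel(float *ptr, size_t capacity, float value) {
  for (size_t i = threadIdx.x + blockIdx.x * size_t(blockDim.x); i < capacity;
       i += size_t(blockDim.x) * gridDim.x) {
    ptr[i] = value;
  }
}

int main() {
  size_t capacity = size_t(1) << 20;
  float *ptr = nullptr;
  cudaMalloc(&ptr, capacity * sizeof(float));

  // Let the runtime propose a grid/block pair for maximum occupancy, exactly as
  // the wrappers above do, then cap the block size so each thread handles more
  // elements and launch overhead is better amortized.
  int grid_size = 0;
  int block_size = 0;
  cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
      &grid_size, &block_size, reinterpret_cast<void const *>(fill_kernel));
  if (result != cudaSuccess) {
    std::printf("Failed to query occupancy.\n");
    return 1;
  }
  block_size = (block_size < 128 ? block_size : 128);

  fill_kernel<<<grid_size, block_size>>>(ptr, capacity, 1.0f);
  cudaDeviceSynchronize();
  cudaFree(ptr);
  return 0;
}
```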
diff --git a/docs/functions_func_s.html b/docs/functions_func_s.html index 785813104b..548b6d97d9 100644 --- a/docs/functions_func_s.html +++ b/docs/functions_func_s.html @@ -141,7 +141,7 @@

- s -

-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/cutlass.h"
32 #include "cutlass/array.h"
33 #include "cutlass/aligned_buffer.h"
35 
36 #include "cutlass/numeric_types.h"
37 #include "cutlass/matrix_shape.h"
38 
39 #include "cutlass/gemm/gemm.h"
41 
43 
44 namespace cutlass {
45 namespace gemm {
46 namespace threadblock {
47 
49 
51 template <
53  typename Shape_,
55  // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
56  typename IteratorA_,
59  typename SmemIteratorA_,
61  // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
62  typename IteratorB_,
65  typename SmemIteratorB_,
67  typename ElementC_,
69  typename LayoutC_,
71  typename Policy_,
73  typename TransformA_ = NumericArrayConverter<
74  typename SmemIteratorA_::Element,
75  typename IteratorA_::Element,
76  IteratorA_::Fragment::kElements>,
79  typename TransformB_ = NumericArrayConverter<
80  typename SmemIteratorB_::Element,
81  typename IteratorB_::Element,
82  IteratorB_::Fragment::kElements>,
84  typename Enable = bool
85 >
86 class MmaPipelined : public MmaBase<Shape_, Policy_, 2> {
87 public:
88 
91 
92  using Shape = Shape_;
93  using IteratorA = IteratorA_;
94  using IteratorB = IteratorB_;
95  using ElementC = ElementC_;
96  using LayoutC = LayoutC_;
97  using Policy = Policy_;
98 
99  using SmemIteratorA = SmemIteratorA_;
100  using SmemIteratorB = SmemIteratorB_;
101 
102  using TransformA = TransformA_;
103  using TransformB = TransformB_;
104 
105  //
106  // Dependent types
107  //
108 
110  using FragmentA = typename IteratorA::Fragment;
111 
113  using FragmentB = typename IteratorB::Fragment;
114 
116  using FragmentC = typename Policy::Operator::FragmentC;
117 
119  using Operator = typename Policy::Operator;
120 
121  // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
122  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
123 
124 private:
125 
126  using WarpFragmentA = typename Operator::FragmentA;
127  using WarpFragmentB = typename Operator::FragmentB;
128 
129 protected:
130 
133 
136 
137 public:
138 
140  CUTLASS_DEVICE
142  typename Base::SharedStorage &shared_storage,
143  int thread_idx,
144  int warp_idx,
145  int lane_idx
146  ):
147  Base(shared_storage, thread_idx, warp_idx, lane_idx),
148  smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
149  smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
150 
151  // Compute warp location within threadblock tile by mapping the warp_id to
152  // three coordinates:
153  // _m: the warp's position within the threadblock along the M dimension
154  // _n: the warp's position within the threadblock along the N dimension
155  // _k: the warp's position within the threadblock along the K dimension
156 
157  int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
158  int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
159 
160  int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
161  int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
162 
163  // Add per-warp offsets in units of warp-level tiles
164  this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
165  this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
166  }
167 
169  CUTLASS_DEVICE
171  int gemm_k_iterations,
172  FragmentC &accum,
173  IteratorA iterator_A,
174  IteratorB iterator_B,
175  FragmentC const &src_accum,
176  TransformA transform_A = TransformA(),
177  TransformB transform_B = TransformB()) {
178 
179  //
180  // Prologue
181  //
182 
183  // Perform accumulation in the 'd' output operand
184  accum = src_accum;
185 
186  FragmentA tb_frag_A;
187  FragmentB tb_frag_B;
188 
189  tb_frag_A.clear();
190  tb_frag_B.clear();
191 
192  // The last kblock is loaded in the prolog
193  iterator_A.load(tb_frag_A);
194  iterator_B.load(tb_frag_B);
195 
196  ++iterator_A;
197  ++iterator_B;
198 
199  this->smem_iterator_A_.store(transform_A(tb_frag_A));
200  this->smem_iterator_B_.store(transform_B(tb_frag_B));
201 
202  ++this->smem_iterator_A_;
203  ++this->smem_iterator_B_;
204 
205  __syncthreads();
206 
207  // Pair of fragments used to overlap shared memory loads and math instructions
208  WarpFragmentA warp_frag_A[2];
209  WarpFragmentB warp_frag_B[2];
210 
211  this->warp_tile_iterator_A_.set_kgroup_index(0);
212  this->warp_tile_iterator_B_.set_kgroup_index(0);
213 
214  this->warp_tile_iterator_A_.load(warp_frag_A[0]);
215  this->warp_tile_iterator_B_.load(warp_frag_B[0]);
216 
217  ++this->warp_tile_iterator_A_;
218  ++this->warp_tile_iterator_B_;
219 
220  Operator warp_mma;
221 
222  int smem_write_stage_idx = 1;
223 
224  // Avoid reading out of bounds
225  if (gemm_k_iterations <= 1) {
226  iterator_A.clear_mask();
227  iterator_B.clear_mask();
228  }
229 
230  // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
231  // shared memory loads (which have the tighest latency requirement).
232 
233  //
234  // Mainloop
235  //
236 
237  // Note: The main loop does not support Base::kWarpGemmIterations == 2.
239  for (; gemm_k_iterations > 0; --gemm_k_iterations) {
240  //
241  // Loop over GEMM K dimension
242  //
243 
245  for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
246 
247  // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
248  // as the case may be.
249 
250  if (warp_mma_k == Base::kWarpGemmIterations - 1) {
251 
252  // Write fragments to shared memory
253  this->smem_iterator_A_.store(transform_A(tb_frag_A));
254 
255  this->smem_iterator_B_.store(transform_B(tb_frag_B));
256 
257  __syncthreads();
258 
259  ++this->smem_iterator_B_;
260  ++this->smem_iterator_A_;
261 
262  // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
263  if (smem_write_stage_idx == 1) {
264  this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
265  this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
266  }
267  else {
268  this->warp_tile_iterator_A_.add_tile_offset(
269  {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
270  this->warp_tile_iterator_B_.add_tile_offset(
271  {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
272  0});
273  }
274 
275  smem_write_stage_idx ^= 1;
276  }
277 
278  this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
279  this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
280 
281  this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
282  this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
283 
284  ++this->warp_tile_iterator_A_;
285  ++this->warp_tile_iterator_B_;
286 
287  if (warp_mma_k == 0) {
288 
289  iterator_A.load(tb_frag_A);
290  iterator_B.load(tb_frag_B);
291 
292  ++iterator_A;
293  ++iterator_B;
294 
295  // Avoid reading out of bounds if this was the last loop iteration
296  if (gemm_k_iterations <= 2) {
297  iterator_A.clear_mask();
298  iterator_B.clear_mask();
299  }
300  }
301 
302  warp_mma(accum, warp_frag_A[warp_mma_k % 2], warp_frag_B[warp_mma_k % 2], accum);
303  }
304  }
305 
306  }
307 };
308 
310 
311 } // namespace threadblock
312 } // namespace gemm
313 } // namespace cutlass
static int const kM
Definition: include/cutlass/gemm/gemm.h:58
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/cutlass.h"
32 #include "cutlass/array.h"
33 #include "cutlass/aligned_buffer.h"
35 
36 #include "cutlass/numeric_types.h"
37 #include "cutlass/matrix_shape.h"
38 
39 #include "cutlass/gemm/gemm.h"
41 
43 
44 namespace cutlass {
45 namespace gemm {
46 namespace threadblock {
47 
49 
51 template <
53  typename Shape_,
55  // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
56  typename IteratorA_,
59  typename SmemIteratorA_,
61  // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
62  typename IteratorB_,
65  typename SmemIteratorB_,
67  typename ElementC_,
69  typename LayoutC_,
71  typename Policy_,
73  typename TransformA_ = NumericArrayConverter<
74  typename SmemIteratorA_::Element,
75  typename IteratorA_::Element,
76  IteratorA_::Fragment::kElements>,
79  typename TransformB_ = NumericArrayConverter<
80  typename SmemIteratorB_::Element,
81  typename IteratorB_::Element,
82  IteratorB_::Fragment::kElements>,
84  typename Enable = bool
85 >
86 class MmaPipelined : public MmaBase<Shape_, Policy_, 2> {
87 public:
88 
91 
92  using Shape = Shape_;
93  using IteratorA = IteratorA_;
94  using IteratorB = IteratorB_;
95  using ElementC = ElementC_;
96  using LayoutC = LayoutC_;
97  using Policy = Policy_;
98 
99  using SmemIteratorA = SmemIteratorA_;
100  using SmemIteratorB = SmemIteratorB_;
101 
102  using TransformA = TransformA_;
103  using TransformB = TransformB_;
104 
105  //
106  // Dependent types
107  //
108 
110  using FragmentA = typename IteratorA::Fragment;
111 
113  using FragmentB = typename IteratorB::Fragment;
114 
116  using FragmentC = typename Policy::Operator::FragmentC;
117 
119  using Operator = typename Policy::Operator;
120 
121  // statically assert kStages for MmaPipelined is two (Double-buffered pipeline)
122  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
123 
124 private:
125 
126  using WarpFragmentA = typename Operator::FragmentA;
127  using WarpFragmentB = typename Operator::FragmentB;
128 
129 protected:
130 
133 
136 
137 public:
138 
140  CUTLASS_DEVICE
142  typename Base::SharedStorage &shared_storage,
143  int thread_idx,
144  int warp_idx,
145  int lane_idx
146  ):
147  Base(shared_storage, thread_idx, warp_idx, lane_idx),
148  smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
149  smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
150 
151  // Compute warp location within threadblock tile by mapping the warp_id to
152  // three coordinates:
153  // _m: the warp's position within the threadblock along the M dimension
154  // _n: the warp's position within the threadblock along the N dimension
155  // _k: the warp's position within the threadblock along the K dimension
156 
157  int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
158  int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
159 
160  int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
161  int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
162 
163  // Add per-warp offsets in units of warp-level tiles
164  this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
165  this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
166  }
167 
169  CUTLASS_DEVICE
171  int gemm_k_iterations,
172  FragmentC &accum,
173  IteratorA iterator_A,
174  IteratorB iterator_B,
175  FragmentC const &src_accum,
176  TransformA transform_A = TransformA(),
177  TransformB transform_B = TransformB()) {
178 
179  //
180  // Prologue
181  //
182 
183  // Perform accumulation in the 'd' output operand
184  accum = src_accum;
185 
186  FragmentA tb_frag_A;
187  FragmentB tb_frag_B;
188 
189  tb_frag_A.clear();
190  tb_frag_B.clear();
191 
192  // The last kblock is loaded in the prolog
193  iterator_A.load(tb_frag_A);
194  iterator_B.load(tb_frag_B);
195 
196  ++iterator_A;
197  ++iterator_B;
198 
199  this->smem_iterator_A_.store(transform_A(tb_frag_A));
200  this->smem_iterator_B_.store(transform_B(tb_frag_B));
201 
202  ++this->smem_iterator_A_;
203  ++this->smem_iterator_B_;
204 
205  __syncthreads();
206 
207  // Pair of fragments used to overlap shared memory loads and math instructions
208  WarpFragmentA warp_frag_A[2];
209  WarpFragmentB warp_frag_B[2];
210 
211  this->warp_tile_iterator_A_.set_kgroup_index(0);
212  this->warp_tile_iterator_B_.set_kgroup_index(0);
213 
214  this->warp_tile_iterator_A_.load(warp_frag_A[0]);
215  this->warp_tile_iterator_B_.load(warp_frag_B[0]);
216 
217  ++this->warp_tile_iterator_A_;
218  ++this->warp_tile_iterator_B_;
219 
220  Operator warp_mma;
221 
222  int smem_write_stage_idx = 1;
223 
224  // Avoid reading out of bounds
225  if (gemm_k_iterations <= 1) {
226  iterator_A.clear_mask();
227  iterator_B.clear_mask();
228  }
229 
230  // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
231  // shared memory loads (which have the tightest latency requirement).
232 
233  //
234  // Mainloop
235  //
236 
237  // Note: The main loop does not support Base::kWarpGemmIterations == 2.
239  for (; gemm_k_iterations > 0; --gemm_k_iterations) {
240  //
241  // Loop over GEMM K dimension
242  //
243 
245  for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
246 
247  // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
248  // as the case may be.
249 
250  if (warp_mma_k == Base::kWarpGemmIterations - 1) {
251 
252  // Write fragments to shared memory
253  this->smem_iterator_A_.store(transform_A(tb_frag_A));
254 
255  this->smem_iterator_B_.store(transform_B(tb_frag_B));
256 
257  __syncthreads();
258 
259  ++this->smem_iterator_B_;
260  ++this->smem_iterator_A_;
261 
262  // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
263  if (smem_write_stage_idx == 1) {
264  this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
265  this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
266  }
267  else {
268  this->warp_tile_iterator_A_.add_tile_offset(
269  {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
270  this->warp_tile_iterator_B_.add_tile_offset(
271  {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
272  0});
273  }
274 
275  smem_write_stage_idx ^= 1;
276  }
277 
278  this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
279  this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
280 
281  this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
282  this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
283 
284  ++this->warp_tile_iterator_A_;
285  ++this->warp_tile_iterator_B_;
286 
287  if (warp_mma_k == 0) {
288 
289  iterator_A.load(tb_frag_A);
290  iterator_B.load(tb_frag_B);
291 
292  ++iterator_A;
293  ++iterator_B;
294 
295  // Avoid reading out of bounds if this was the last loop iteration
296  if (gemm_k_iterations <= 2) {
297  iterator_A.clear_mask();
298  iterator_B.clear_mask();
299  }
300  }
301 
302  warp_mma(accum, warp_frag_A[warp_mma_k % 2], warp_frag_B[warp_mma_k % 2], accum);
303  }
304  }
305 
306  }
307 };
308 
310 
311 } // namespace threadblock
312 } // namespace gemm
313 } // namespace cutlass
static int const kM
Definition: include/cutlass/gemm/gemm.h:58
LayoutC_ LayoutC
Layout of accumulator matrix.
Definition: mma_pipelined.h:96
TransformB_ TransformB
Definition: mma_pipelined.h:103
Definition: aligned_buffer.h:35
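One detail worth spelling out from the `MmaPipelined` constructor shown (unchanged) in both halves of this hunk: the mapping of `warp_idx` to `(m, n, k)` coordinates is a plain mixed-radix decomposition over the threadblock's warp counts. The host-side sketch below uses arbitrary example warp counts rather than values from any particular CUTLASS policy.

```cpp
#include <cstdio>

int main() {
  // Example warp counts along M, N, and K (illustrative values only).
  const int kWarpCountM = 2;
  const int kWarpCountN = 2;
  const int kWarpCountK = 2;

  for (int warp_idx = 0; warp_idx < kWarpCountM * kWarpCountN * kWarpCountK; ++warp_idx) {
    // Same arithmetic as MmaPipelined's constructor.
    int warp_idx_mn = warp_idx % (kWarpCountM * kWarpCountN);
    int warp_idx_k  = warp_idx / (kWarpCountM * kWarpCountN);
    int warp_idx_m  = warp_idx_mn % kWarpCountM;
    int warp_idx_n  = warp_idx_mn / kWarpCountM;
    std::printf("warp %d -> (m=%d, n=%d, k=%d)\n",
                warp_idx, warp_idx_m, warp_idx_n, warp_idx_k);
  }
  return 0;
}
```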
diff --git a/docs/namespacecutlass_1_1gemm_1_1device.html b/docs/namespacecutlass_1_1gemm_1_1device.html index 7023f4f88a..c0b27cbe97 100644 --- a/docs/namespacecutlass_1_1gemm_1_1device.html +++ b/docs/namespacecutlass_1_1gemm_1_1device.html @@ -134,17 +134,17 @@ class  Gemm   class  Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero > - Parital specialization for column-major output exchanges problem size and operand. More...
+ Partial specialization for column-major output exchanges problem size and operand. More...
  class  GemmBatched   class  GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ > - Parital specialization for column-major output exchanges problem size and operand. More...
+ Partial specialization for column-major output exchanges problem size and operand. More...
  class  GemmComplex   class  GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial > - Parital specialization for column-major output exchanges problem size and operand. More...
+ Partial specialization for column-major output exchanges problem size and operand. More...
  class  GemmSplitKParallel   diff --git a/docs/namespacecutlass_1_1reference_1_1device.html b/docs/namespacecutlass_1_1reference_1_1device.html index 86f21a00bb..54f5009f41 100644 --- a/docs/namespacecutlass_1_1reference_1_1device.html +++ b/docs/namespacecutlass_1_1reference_1_1device.html @@ -125,7 +125,7 @@  Partial specialization for multiply-add-saturate. More...
  struct  Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc > - Parital specialization for XOR-popc. More...
+ Partial specialization for XOR-popc. More...
  struct  TensorDiagonalForEach  Launches a kernel calling a functor for each element along a tensor's diagonal. More...
@@ -183,7 +183,7 @@   template<typename Element , typename Layout > void TensorFillIdentity (TensorView< Element, Layout > view) - Fills a tensor's digonal with 1 and 0 everywhere else. More...
+ Fills a tensor's diagonal with 1 and 0 everywhere else. More...
  template<typename Element , typename Layout > void TensorUpdateDiagonal (TensorView< Element, Layout > view, Element diag=Element(1)) diff --git a/docs/namespacecutlass_1_1reference_1_1host.html b/docs/namespacecutlass_1_1reference_1_1host.html index 90f9a01442..194bdfe88d 100644 --- a/docs/namespacecutlass_1_1reference_1_1host.html +++ b/docs/namespacecutlass_1_1reference_1_1host.html @@ -122,7 +122,7 @@  Partial specialization for multiply-add-saturate. More...
  struct  Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc > - Parital specialization for XOR-popc. More...
+ Partial specialization for XOR-popc. More...
  - + @@ -1677,7 +1677,7 @@

Function Documentation

@@ -247,7 +247,7 @@

 
template<typename Element , typename Layout >
void TensorFillIdentity (TensorView< Element, Layout > dst)
 Helper to fill a tensor's digonal with 1 and 0 everywhere else. More...
 Helper to fill a tensor's diagonal with 1 and 0 everywhere else. More...
 
template<typename Element , typename Layout >
void TensorUpdateDiagonal (TensorView< Element, Layout > dst, Element val=Element(1))
-

Returns a pair containing a boolean of whether a value exists in a tensor and the location of of the first occurrence. If the value is not contained in the tensor, the second element of the pair is undefined.

+

Returns a pair containing a boolean of whether a value exists in a tensor and the location of the first occurrence. If the value is not contained in the tensor, the second element of the pair is undefined.

diff --git a/docs/namespacecutlass_1_1transform.html b/docs/namespacecutlass_1_1transform.html index 974fe32468..c8eb5ba164 100644 --- a/docs/namespacecutlass_1_1transform.html +++ b/docs/namespacecutlass_1_1transform.html @@ -128,7 +128,7 @@ struct  TransposePitchLinearThreadMap   struct  TransposePitchLinearThreadMap2DThreadTile - Thread Mapping a 2D threadtiled mapping as a tranposed Pitchlinear2DThreadTile mapping. More...
+ Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping. More...
  struct  TransposePitchLinearThreadMapSimt   diff --git a/docs/pitch__linear__thread__map_8h.html b/docs/pitch__linear__thread__map_8h.html index 13fef0cc77..0266464109 100644 --- a/docs/pitch__linear__thread__map_8h.html +++ b/docs/pitch__linear__thread__map_8h.html @@ -164,7 +164,7 @@  Internal implementation details. More...
  struct  cutlass::transform::TransposePitchLinearThreadMap2DThreadTile< ThreadMap_ > - Thread Mapping a 2D threadtiled mapping as a tranposed Pitchlinear2DThreadTile mapping. More...
+ Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping. More...
  - + diff --git a/docs/structcutlass_1_1CommandLine.html b/docs/structcutlass_1_1CommandLine.html index 0bde0ec4ad..01cd35a270 100644 --- a/docs/structcutlass_1_1CommandLine.html +++ b/docs/structcutlass_1_1CommandLine.html @@ -151,7 +151,7 @@ - +

diff --git a/docs/pitch__linear__thread__map_8h_source.html b/docs/pitch__linear__thread__map_8h_source.html index fcdf70ae10..9f18269e15 100644 --- a/docs/pitch__linear__thread__map_8h_source.html +++ b/docs/pitch__linear__thread__map_8h_source.html @@ -129,7 +129,7 @@
Definition: pitch_linear_thread_map.h:491
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:187
WarpThreadArrangement_ WarpThreadArrangement
Fixed arrangement of threads within a warp (units of threads).
Definition: pitch_linear_thread_map.h:226
-
Thread Mapping a 2D threadtiled mapping as a tranposed Pitchlinear2DThreadTile mapping.
Definition: pitch_linear_thread_map.h:713
+
Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping.
Definition: pitch_linear_thread_map.h:713
WarpThreadArrangement_ WarpThreadArrangement
Fixed arrangement of threads within a warp (units of threads).
Definition: pitch_linear_thread_map.h:355
Internal details made public to facilitate introspection Iterations along each dimension (concept: Pi...
Definition: pitch_linear_thread_map.h:353
Definition: pitch_linear_thread_map.h:205
diff --git a/docs/search/all_12.js b/docs/search/all_12.js index 0e091040bc..c9f8a45c70 100644 --- a/docs/search/all_12.js +++ b/docs/search/all_12.js @@ -14,7 +14,7 @@ var searchData= ['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html',1,'cutlass']]], ['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore::Semaphore()'],['../structcutlass_1_1gemm_1_1kernel_1_1Gemm_1_1Params.html#adec6d0c6d74e7f456196f453e302fbbb',1,'cutlass::gemm::kernel::Gemm::Params::semaphore()']]], ['semaphore_2eh',['semaphore.h',['../semaphore_8h.html',1,'']]], - ['seperate_5fstring',['seperate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]], + ['separate_5fstring',['separate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]], ['sequential',['sequential',['../structcutlass_1_1Distribution.html#ab86d975567ef141ff82067b1f41cd3ee',1,'cutlass::Distribution::sequential()'],['../structcutlass_1_1Distribution.html#a499f4023e0d42356ce71d38cc32bf92aa39d3cf55e90573c8d1dfb483cfb410dc',1,'cutlass::Distribution::Sequential()']]], ['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]], ['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]], diff --git a/docs/search/functions_12.js b/docs/search/functions_12.js index f2b3bff9cf..6648b431e4 100644 --- a/docs/search/functions_12.js +++ b/docs/search/functions_12.js @@ -3,7 +3,7 @@ var searchData= ['scalar_5fop',['scalar_op',['../structcutlass_1_1minimum_3_01Array_3_01T_00_01N_01_4_01_4.html#a4b42227184cb7c796460062c46a84b57',1,'cutlass::minimum< Array< T, N > >']]], ['scalario',['ScalarIO',['../structcutlass_1_1ScalarIO.html#ad4166575521254088bf6c6300c351714',1,'cutlass::ScalarIO::ScalarIO()'],['../structcutlass_1_1ScalarIO.html#a5227e1e9ed24326ad4f8dc94d186186f',1,'cutlass::ScalarIO::ScalarIO(T value)']]], ['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore']]], - ['seperate_5fstring',['seperate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]], + ['separate_5fstring',['separate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]], ['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]], ['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]], ['set_5fidentity',['set_identity',['../structcutlass_1_1Distribution.html#aad2cf02af3d520544d89843cc4295858',1,'cutlass::Distribution']]], diff --git a/docs/structcutlass_1_1CommandLine-members.html b/docs/structcutlass_1_1CommandLine-members.html index 
77668951c3..6a17b2f0f5 100644 --- a/docs/structcutlass_1_1CommandLine-members.html +++ b/docs/structcutlass_1_1CommandLine-members.html @@ -115,7 +115,7 @@

keyscutlass::CommandLine
num_naked_args() const cutlass::CommandLineinline
parsed_argc() const cutlass::CommandLineinline
seperate_string(std::string const &str, std::vector< value_t > &vals, char sep= ',')cutlass::CommandLineinlinestatic
separate_string(std::string const &str, std::vector< value_t > &vals, char sep= ',')cutlass::CommandLineinlinestatic
tokenize(std::vector< std::pair< std::string, std::string > > &tokens, std::string const &str, char delim= ',', char sep= ':')cutlass::CommandLineinlinestatic
tokenize(std::vector< std::string > &tokens, std::string const &str, char delim= ',', char sep= ':')cutlass::CommandLineinlinestatic
valuescutlass::CommandLine
 Tokenizes a comma-delimited list of string pairs delimited by ':'. More...
 
template<typename value_t >
static void seperate_string (std::string const &str, std::vector< value_t > &vals, char sep= ',')
static void separate_string (std::string const &str, std::vector< value_t > &vals, char sep= ',')
 
- +

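These CommandLine helpers, with the spelling corrected to separate_string, are useful outside the profiler as well. Below is a minimal sketch, assuming cutlass::CommandLine lives in the tools/util header cutlass/util/command_line.h and behaves as the signatures on this page describe: separate_string splits a delimited list into typed values, tokenize splits a comma-delimited list of key:value pairs.

```cpp
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include "cutlass/util/command_line.h"

int main() {
  // Split a comma-delimited list into typed values.
  std::vector<int> extent;
  cutlass::CommandLine::separate_string(std::string("16,16,8"), extent, ',');

  // Tokenize a comma-delimited list of key:value pairs.
  std::vector<std::pair<std::string, std::string>> tokens;
  cutlass::CommandLine::tokenize(tokens, std::string("m:128,n:128,k:32"), ',', ':');

  for (int e : extent) {
    std::cout << e << " ";
  }
  std::cout << "\n";
  for (auto const &kv : tokens) {
    std::cout << kv.first << " = " << kv.second << "\n";
  }
  return 0;
}
```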
@@ -548,7 +548,7 @@

Member Function Documentation

- + diff --git a/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html b/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html index 37cb3e5ddc..2f4bf08eac 100644 --- a/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html +++ b/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html @@ -104,7 +104,7 @@
-

Parital specialization for XOR-popc. +

Partial specialization for XOR-popc.

#include <gemm.h>

diff --git a/docs/structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html b/docs/structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html index 2c89af687f..2daeadcc77 100644 --- a/docs/structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html +++ b/docs/structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html @@ -112,7 +112,7 @@
- +
static void cutlass::CommandLine::seperate_string static void cutlass::CommandLine::separate_string ( std::string const &  str,

Public Member Functions

__inline__ __device__ TensorForEachHelper (Func &func, Coord< Rank > const &size, Coord< Rank > &coord, int64_t index)
 Constructor for fastest chaning rank. More...
 Constructor for fastest changing rank. More...
 

Constructor & Destructor Documentation

diff --git a/docs/structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html b/docs/structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html index 0840df5980..7a83a97d8c 100644 --- a/docs/structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html +++ b/docs/structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html @@ -104,7 +104,7 @@
-

Parital specialization for XOR-popc. +

Partial specialization for XOR-popc.

#include <gemm.h>

diff --git a/docs/structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html b/docs/structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html index 2e440e45ad..6c63b40e38 100644 --- a/docs/structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html +++ b/docs/structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html @@ -113,7 +113,7 @@

Public Member Functions

 TensorForEachHelper (Func &func, Coord< Rank > const &extent, Coord< Rank > &coord)
 Constructor for fastest chaning rank. More...
 Constructor for fastest changing rank. More...
 
- +

diff --git a/docs/structcutlass_1_1transform_1_1TransposePitchLinearThreadMap2DThreadTile.html b/docs/structcutlass_1_1transform_1_1TransposePitchLinearThreadMap2DThreadTile.html index 41e0af2623..bc5294f7c1 100644 --- a/docs/structcutlass_1_1transform_1_1TransposePitchLinearThreadMap2DThreadTile.html +++ b/docs/structcutlass_1_1transform_1_1TransposePitchLinearThreadMap2DThreadTile.html @@ -106,7 +106,7 @@
-

Thread Mapping a 2D threadtiled mapping as a tranposed Pitchlinear2DThreadTile mapping. +

Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping.

#include <pitch_linear_thread_map.h>

diff --git a/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html b/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html index 2a0a978e79..cc75285506 100644 --- a/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html +++ b/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html @@ -134,7 +134,7 @@

 Partial specialization for multiply-add-saturate. More...
 
struct  cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc >
 Parital specialization for XOR-popc. More...
 Partial specialization for XOR-popc. More...
 
- +

diff --git a/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html b/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html index d20a078440..b0bfdbc283 100644 --- a/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html +++ b/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html @@ -141,7 +141,7 @@

 Partial specialization for multiply-add-saturate. More...
 
struct  cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc >
 Parital specialization for XOR-popc. More...
 Partial specialization for XOR-popc. More...
 

diff --git a/docs/wmma__sm75_8h_source.html b/docs/wmma__sm75_8h_source.html index 72ad72f91f..6ff6405d34 100644 --- a/docs/wmma__sm75_8h_source.html +++ b/docs/wmma__sm75_8h_source.html @@ -98,7 +98,7 @@
wmma_sm75.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include <assert.h>
32 #include "cutlass/layout/matrix.h"
33 
35 namespace cutlass {
36 namespace arch {
37 
39 //
40 // WMMA template structure defines nvcuda::wmma::fragments and static assert for
41 // wmma native instruction sizes supported for cutlass::int4b_t (experimental::s4).
42 //
44 template <
45 typename Shape_,
46 typename LayoutA_,
47 typename LayoutB_,
48 typename LayoutC_>
49 struct Wmma<
50  Shape_,
51  cutlass::int4b_t,
52  LayoutA_,
54  LayoutB_,
55  int32_t,
56  LayoutC_,
57  cutlass::arch::OpMultiplyAdd
58 > {
59 #if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
60  using Shape = Shape_;
61  using ElementA = cutlass::int4b_t;
62  using LayoutA = LayoutA_;
63  using ElementB = cutlass::int4b_t;
64  using LayoutB = LayoutB_;
65  using ElementC = int32_t;
66  using LayoutC = LayoutC_;
67  using Operator = cutlass::arch::OpMultiplyAdd;
68 
69  // check supported wmma shape for the given multiplicand data types
72  "Supported list of wmma operator shape for s8 multiplicands is: 8x8x32");
73 
74 
75  // Wmma Fragment
76  using FragmentA = nvcuda::wmma::fragment<
77  nvcuda::wmma::matrix_a,
78  Shape::kM,
79  Shape::kN,
80  Shape::kK,
81  typename CutlassToWmmaDataType<ElementA>::Type,
82  typename CutlassToWmmaLayout<LayoutA>::Layout>;
83 
84  using FragmentB = nvcuda::wmma::fragment<
85  nvcuda::wmma::matrix_b,
86  Shape::kM,
87  Shape::kN,
88  Shape::kK,
89  typename CutlassToWmmaDataType<ElementB>::Type,
90  typename CutlassToWmmaLayout<LayoutB>::Layout>;
91 
92  using FragmentC = nvcuda::wmma::fragment<
93  nvcuda::wmma::accumulator,
94  Shape::kM,
95  Shape::kN,
96  Shape::kK,
97  typename CutlassToWmmaDataType<ElementC>::Type>;
98 
100  CUTLASS_DEVICE
101  void operator()(
102  FragmentC &D,
103  FragmentA const &A,
104  FragmentB const &B,
105  FragmentC const &C) const {
106  nvcuda::wmma::mma_sync(D, A, B, C);
107  }
108 
109 #else
110  static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM75 and beyond");
111 #endif
112 
113 };
114 
116 //
117 // WMMA template structure defines nvcuda::wmma::fragments and static assert for
118 // wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1)
119 // (nvcuda::wmma targetting SASS instruction BMMA)
120 //
122 template <
123 typename Shape_,
124 typename LayoutA_,
125 typename LayoutB_,
126 typename LayoutC_>
127 struct Wmma<
128  Shape_,
129  cutlass::uint1b_t,
130  LayoutA_,
132  LayoutB_,
133  int32_t,
134  LayoutC_,
135  cutlass::arch::OpXorPopc
136 > {
137 #if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
138  using Shape = Shape_;
139  using ElementA = cutlass::uint1b_t;
140  using LayoutA = LayoutA_;
141  using ElementB = cutlass::uint1b_t;
142  using LayoutB = LayoutB_;
143  using ElementC = int32_t;
144  using LayoutC = LayoutC_;
145  using Operator = cutlass::arch::OpXorPopc;
146 
147  // check supported wmma shape for the given multiplicand data types
150  "Supported list of wmma operator shape for b1 multiplicands is: 8x8x128");
151 
152 
153  // Wmma Fragment
154  using FragmentA = nvcuda::wmma::fragment<
155  nvcuda::wmma::matrix_a,
156  Shape::kM,
157  Shape::kN,
158  Shape::kK,
159  typename CutlassToWmmaDataType<ElementA>::Type,
160  typename CutlassToWmmaLayout<LayoutA>::Layout>;
161 
162  using FragmentB = nvcuda::wmma::fragment<
163  nvcuda::wmma::matrix_b,
164  Shape::kM,
165  Shape::kN,
166  Shape::kK,
167  typename CutlassToWmmaDataType<ElementB>::Type,
168  typename CutlassToWmmaLayout<LayoutB>::Layout>;
169 
170  using FragmentC = nvcuda::wmma::fragment<
171  nvcuda::wmma::accumulator,
172  Shape::kM,
173  Shape::kN,
174  Shape::kK,
175  typename CutlassToWmmaDataType<ElementC>::Type>;
176 
178  CUTLASS_DEVICE
179  void operator()(
180  FragmentC &D,
181  FragmentA const &A,
182  FragmentB const &B,
183  FragmentC const &C) const {
184 
185  nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR,
186  nvcuda::wmma::experimental::bmmaAccumulateOpPOPC);
187  }
188 
189 #else
190  static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM75 and beyond");
191 #endif
192 
193 };
194 
195 } // namespace arch
196 } // namespace cutlass
Definition: aligned_buffer.h:35
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include <assert.h>
32 #include "cutlass/layout/matrix.h"
33 
35 namespace cutlass {
36 namespace arch {
37 
39 //
40 // WMMA template structure defines nvcuda::wmma::fragments and static assert for
41 // wmma native instruction sizes supported for cutlass::int4b_t (experimental::s4).
42 //
44 template <
45 typename Shape_,
46 typename LayoutA_,
47 typename LayoutB_,
48 typename LayoutC_>
49 struct Wmma<
50  Shape_,
51  cutlass::int4b_t,
52  LayoutA_,
54  LayoutB_,
55  int32_t,
56  LayoutC_,
57  cutlass::arch::OpMultiplyAdd
58 > {
59 #if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
60  using Shape = Shape_;
61  using ElementA = cutlass::int4b_t;
62  using LayoutA = LayoutA_;
63  using ElementB = cutlass::int4b_t;
64  using LayoutB = LayoutB_;
65  using ElementC = int32_t;
66  using LayoutC = LayoutC_;
67  using Operator = cutlass::arch::OpMultiplyAdd;
68 
69  // check supported wmma shape for the given multiplicand data types
72  "Supported list of wmma operator shape for s8 multiplicands is: 8x8x32");
73 
74 
75  // Wmma Fragment
76  using FragmentA = nvcuda::wmma::fragment<
77  nvcuda::wmma::matrix_a,
78  Shape::kM,
79  Shape::kN,
80  Shape::kK,
81  typename CutlassToWmmaDataType<ElementA>::Type,
82  typename CutlassToWmmaLayout<LayoutA>::Layout>;
83 
84  using FragmentB = nvcuda::wmma::fragment<
85  nvcuda::wmma::matrix_b,
86  Shape::kM,
87  Shape::kN,
88  Shape::kK,
89  typename CutlassToWmmaDataType<ElementB>::Type,
90  typename CutlassToWmmaLayout<LayoutB>::Layout>;
91 
92  using FragmentC = nvcuda::wmma::fragment<
93  nvcuda::wmma::accumulator,
94  Shape::kM,
95  Shape::kN,
96  Shape::kK,
97  typename CutlassToWmmaDataType<ElementC>::Type>;
98 
100  CUTLASS_DEVICE
101  void operator()(
102  FragmentC &D,
103  FragmentA const &A,
104  FragmentB const &B,
105  FragmentC const &C) const {
106  nvcuda::wmma::mma_sync(D, A, B, C);
107  }
108 
109 #else
110  static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM75 and beyond");
111 #endif
112 
113 };
114 
116 //
117 // WMMA template structure defines nvcuda::wmma::fragments and static assert for
118 // wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1)
119 // (nvcuda::wmma targeting SASS instruction BMMA)
120 //
122 template <
123 typename Shape_,
124 typename LayoutA_,
125 typename LayoutB_,
126 typename LayoutC_>
127 struct Wmma<
128  Shape_,
129  cutlass::uint1b_t,
130  LayoutA_,
132  LayoutB_,
133  int32_t,
134  LayoutC_,
135  cutlass::arch::OpXorPopc
136 > {
137 #if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
138  using Shape = Shape_;
139  using ElementA = cutlass::uint1b_t;
140  using LayoutA = LayoutA_;
141  using ElementB = cutlass::uint1b_t;
142  using LayoutB = LayoutB_;
143  using ElementC = int32_t;
144  using LayoutC = LayoutC_;
145  using Operator = cutlass::arch::OpXorPopc;
146 
147  // check supported wmma shape for the given multiplicand data types
150  "Supported list of wmma operator shape for b1 multiplicands is: 8x8x128");
151 
152 
153  // Wmma Fragment
154  using FragmentA = nvcuda::wmma::fragment<
155  nvcuda::wmma::matrix_a,
156  Shape::kM,
157  Shape::kN,
158  Shape::kK,
159  typename CutlassToWmmaDataType<ElementA>::Type,
160  typename CutlassToWmmaLayout<LayoutA>::Layout>;
161 
162  using FragmentB = nvcuda::wmma::fragment<
163  nvcuda::wmma::matrix_b,
164  Shape::kM,
165  Shape::kN,
166  Shape::kK,
167  typename CutlassToWmmaDataType<ElementB>::Type,
168  typename CutlassToWmmaLayout<LayoutB>::Layout>;
169 
170  using FragmentC = nvcuda::wmma::fragment<
171  nvcuda::wmma::accumulator,
172  Shape::kM,
173  Shape::kN,
174  Shape::kK,
175  typename CutlassToWmmaDataType<ElementC>::Type>;
176 
178  CUTLASS_DEVICE
179  void operator()(
180  FragmentC &D,
181  FragmentA const &A,
182  FragmentB const &B,
183  FragmentC const &C) const {
184 
185  nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR,
186  nvcuda::wmma::experimental::bmmaAccumulateOpPOPC);
187  }
188 
189 #else
190  static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM75 and beyond");
191 #endif
192 
193 };
194 
195 } // namespace arch
196 } // namespace cutlass
Definition: aligned_buffer.h:35
std::is_same (false specialization)
Definition: platform.h:394
integer_subbyte< 1, false > uint1b_t
1-bit Unsigned integer type
Definition: integer_subbyte.h:152
4-bit signed integer type
Definition: integer_subbyte.h:42
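The listing above defines WMMA atoms for the sub-byte types; the b1 specialization (8x8x128, XOR plus POPC) is the one binary tensor core kernels build on. A minimal device-side sketch follows. It assumes cutlass/arch/wmma.h is the umbrella header that pulls this file in, that the primary cutlass::arch::Wmma template takes <Shape, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, Operator> as reconstructed from the partial specialization above, and that the translation unit is compiled for SM75 or newer so CUTLASS_ARCH_WMMA_SM75_ENABLED is defined.

```cpp
#include "cutlass/arch/mma.h"        // cutlass::arch::OpXorPopc
#include "cutlass/arch/wmma.h"       // dispatches to wmma_sm75.h on SM75 and newer
#include "cutlass/gemm/gemm.h"       // cutlass::gemm::GemmShape
#include "cutlass/layout/matrix.h"
#include "cutlass/numeric_types.h"   // cutlass::uint1b_t

// Binary tensor op atom documented above: D = popc(A xor B) accumulated into int32.
using WmmaB1 = cutlass::arch::Wmma<
    cutlass::gemm::GemmShape<8, 8, 128>,   // the only shape accepted for b1 multiplicands
    cutlass::uint1b_t, cutlass::layout::RowMajor,
    cutlass::uint1b_t, cutlass::layout::ColumnMajor,
    int32_t,           cutlass::layout::RowMajor,
    cutlass::arch::OpXorPopc>;

__device__ void bmma_tile(WmmaB1::FragmentC &d,
                          WmmaB1::FragmentA const &a,
                          WmmaB1::FragmentB const &b,
                          WmmaB1::FragmentC const &c) {
  // Issues nvcuda::wmma::bmma_sync with bmmaBitOpXOR / bmmaAccumulateOpPOPC,
  // exactly as the specialization listed above does.
  WmmaB1 op;
  op(d, a, b, c);
}
```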
diff --git a/examples/00_basic_gemm/CMakeLists.txt b/examples/00_basic_gemm/CMakeLists.txt index 5af8fcf363..9002aad943 100644 --- a/examples/00_basic_gemm/CMakeLists.txt +++ b/examples/00_basic_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu index 7c633b30a5..c867112fc2 100644 --- a/examples/00_basic_gemm/basic_gemm.cu +++ b/examples/00_basic_gemm/basic_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -47,7 +47,7 @@ or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are prevalent in the CUTLASS unit tests. - This example has delibrately been kept similar to the basic_gemm example from cutass-1.3 to + This example has delibrately been kept similar to the basic_gemm example from cutlass-1.3 to highlight the minimum amount of differences needed to transition to cutlass-2.0. Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu diff --git a/examples/01_cutlass_utilities/CMakeLists.txt b/examples/01_cutlass_utilities/CMakeLists.txt index 5673303843..bf37d18a84 100644 --- a/examples/01_cutlass_utilities/CMakeLists.txt +++ b/examples/01_cutlass_utilities/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/01_cutlass_utilities/cutlass_utilities.cu b/examples/01_cutlass_utilities/cutlass_utilities.cu index 30ec28412d..43a3d46d99 100644 --- a/examples/01_cutlass_utilities/cutlass_utilities.cu +++ b/examples/01_cutlass_utilities/cutlass_utilities.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/02_dump_reg_shmem/CMakeLists.txt b/examples/02_dump_reg_shmem/CMakeLists.txt index 9dd94ab4a5..0216f2b480 100644 --- a/examples/02_dump_reg_shmem/CMakeLists.txt +++ b/examples/02_dump_reg_shmem/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -31,4 +31,5 @@ cutlass_example_add_executable( 02_dump_reg_shmem dump_reg_shmem.cu + DISABLE_TESTS ON ) diff --git a/examples/02_dump_reg_shmem/dump_reg_shmem.cu b/examples/02_dump_reg_shmem/dump_reg_shmem.cu index 159b0b4924..3db7821ffa 100644 --- a/examples/02_dump_reg_shmem/dump_reg_shmem.cu +++ b/examples/02_dump_reg_shmem/dump_reg_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/03_visualize_layout/CMakeLists.txt b/examples/03_visualize_layout/CMakeLists.txt index 27c38249ec..be8c7436fa 100644 --- a/examples/03_visualize_layout/CMakeLists.txt +++ b/examples/03_visualize_layout/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -29,7 +29,6 @@ set(TEST_COMMAND_00 RowMajor --extent=16,16) -set(TEST_COMMAND_01 \"ColumnMajorInterleaved<4>\" --extent=32,8 --output-shape=16 --vectorize=4) cutlass_example_add_executable( 03_visualize_layout @@ -37,6 +36,5 @@ cutlass_example_add_executable( register_layout.cu TEST_COMMAND_OPTIONS TEST_COMMAND_00 - TEST_COMMAND_01 ) diff --git a/examples/03_visualize_layout/options.h b/examples/03_visualize_layout/options.h index 2b1d8fdb5f..d422466852 100644 --- a/examples/03_visualize_layout/options.h +++ b/examples/03_visualize_layout/options.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/03_visualize_layout/register_layout.cu b/examples/03_visualize_layout/register_layout.cu index 060abe353b..d20c893a16 100644 --- a/examples/03_visualize_layout/register_layout.cu +++ b/examples/03_visualize_layout/register_layout.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -64,15 +64,15 @@ void RegisterLayouts(std::map // All Ampere/Turing H/Integer matrix multiply tensor core kernels uses the same swizzling // layout implementation with different templates. 
// - // BMMA 88128 Interleaved-256 - // BMMA 168256 Interleaved-256 + // mma.sync.aligned.m8n8k128.s32.b1.b1.s32 Interleaved-256 + // mma.sync.aligned.m16n8k256.s32.b1.b1.s32 Interleaved-256 {"TensorOpMultiplicand<1,256>", new VisualizeLayout>}, - // BMMA 88128 TN kblock512 - // BMMA 168256 TN kblock512 + // mma.sync.aligned.m8n8k128.s32.b1.b1.s32 TN kblock512 + // mma.sync.aligned.m16n8k256.s32.b1.b1.s32 TN kblock512 {"TensorOpMultiplicand<1,512>", new VisualizeLayout>}, - // BMMA 168256 TN kblock1024 + // mma.sync.aligned.m16n8k256.s32.b1.b1.s32 TN kblock1024 {"TensorOpMultiplicand<1,1024>", new VisualizeLayout>}, // Integer matrix multiply.int4 8832 Interleaved-64 diff --git a/examples/03_visualize_layout/register_layout.h b/examples/03_visualize_layout/register_layout.h index b473279af9..0375f3251c 100644 --- a/examples/03_visualize_layout/register_layout.h +++ b/examples/03_visualize_layout/register_layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/03_visualize_layout/visualize_layout.cpp b/examples/03_visualize_layout/visualize_layout.cpp index 81be32901d..1edf830d8e 100644 --- a/examples/03_visualize_layout/visualize_layout.cpp +++ b/examples/03_visualize_layout/visualize_layout.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -95,7 +95,7 @@ void print_usage(std::ostream &out) { "--extent=16,16 --vectorize=2 --output-shape=16,4\n" << "$ 03_visualize_layout \"VoltaTensorOpMultiplicandCrosswise<16,32>\" " "--extent=32,64 --vectorize=4 --output-shape=64,4\n" - << "$ 03_visualize_layout \"VotlaTensorOpMultiplicandCongruous<16>\" " + << "$ 03_visualize_layout \"VoltaTensorOpMultiplicandCongruous<16>\" " "--extent=64,32 --vectorize=8 --output-shape=64,4\n"; out << std::endl; diff --git a/examples/03_visualize_layout/visualize_layout.h b/examples/03_visualize_layout/visualize_layout.h index 7c9a1bc2f5..f070bad265 100644 --- a/examples/03_visualize_layout/visualize_layout.h +++ b/examples/03_visualize_layout/visualize_layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -260,7 +260,7 @@ class VisualizeLayout : public VisualizeLayoutBase { if (options.vectorize <= 2) return std::make_pair(false, -1); // Boundary check. - if (i > elements.size() || (i + options.vectorize - 1) > elements.size()) + if (i > int(elements.size()) || (i + options.vectorize - 1) > int(elements.size())) return std::make_pair(false, -1); // Check if either all elements are valid or invalid. 
diff --git a/examples/04_tile_iterator/CMakeLists.txt b/examples/04_tile_iterator/CMakeLists.txt index 52e1d665b6..55482729bd 100644 --- a/examples/04_tile_iterator/CMakeLists.txt +++ b/examples/04_tile_iterator/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu index 886c17701a..b9441a562d 100644 --- a/examples/04_tile_iterator/tile_iterator.cu +++ b/examples/04_tile_iterator/tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -94,7 +94,7 @@ __global__ void copy( typename Iterator::Fragment fragment; - for(int i = 0; i < fragment.size(); ++i) { + for(size_t i = 0; i < fragment.size(); ++i) { fragment[i] = 0; } diff --git a/examples/05_batched_gemm/CMakeLists.txt b/examples/05_batched_gemm/CMakeLists.txt index f42e76b235..cd69403aa9 100644 --- a/examples/05_batched_gemm/CMakeLists.txt +++ b/examples/05_batched_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/05_batched_gemm/batched_gemm.cu b/examples/05_batched_gemm/batched_gemm.cu index 2ce552c79f..5fb7518f61 100644 --- a/examples/05_batched_gemm/batched_gemm.cu +++ b/examples/05_batched_gemm/batched_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -81,7 +81,7 @@ matrix A can be seen as --------------------------------------- batch 0 | batch 1 , where batch size is 2, M is 6 and K is 2 -The stride (batch_stride_B) between the first element of two batches is lda * k +The stride (batch_stride_A) between the first element of two batches is lda * k matrix B can be seen as ----------------------------- @@ -94,7 +94,7 @@ matrix B can be seen as (1,1,0) | (1,1,1) | (1,1,2) | ----------------------------- , where the batch size is 2, N is 3 and K is 2 -The stride (batch_stride_C) between the first element of two batches is k +The stride (batch_stride_B) between the first element of two batches is k */ @@ -207,15 +207,15 @@ cudaError_t strided_batched_gemm_nn_reference( cudaError_t result = cudaSuccess; - if (A.size() < lda * k * batch_count) { + if (A.size() < size_t(lda * k * batch_count)) { std::cout << "the size of A is too small" << std::endl; return cudaErrorInvalidValue; } - if (B.size() < ldb * n) { + if (B.size() < size_t(ldb * n)) { std::cout << "the size of B is too small" << std::endl; return cudaErrorInvalidValue; } - if (C.size() < ldc * n * batch_count) { + if (C.size() < size_t(ldc * n * batch_count)) { std::cout << "the size of C is too small" << std::endl; return cudaErrorInvalidValue; } diff --git a/examples/06_splitK_gemm/CMakeLists.txt b/examples/06_splitK_gemm/CMakeLists.txt index 04d7af8cf4..e0d11d0c0c 100644 --- a/examples/06_splitK_gemm/CMakeLists.txt +++ b/examples/06_splitK_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/06_splitK_gemm/splitk_gemm.cu b/examples/06_splitK_gemm/splitk_gemm.cu index 03e60788b1..1a559b8326 100644 --- a/examples/06_splitK_gemm/splitk_gemm.cu +++ b/examples/06_splitK_gemm/splitk_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -55,7 +55,7 @@ composed from lower level ones. Multiple thread-tiles (tile size each thread com to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute threadblock-tile (tile size computed by a threadblock). -In thie example, we split variable initialization into +In this example, we split variable initialization into 1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel can view them (logical to physical mapping) 2. Setting up computation properties : describes how the above set matrices will be used to compute @@ -74,10 +74,10 @@ ElementAccumulator (float), ElementComputeEpilogue (float), ElementInputA (cutla ElementInputB (cutlass::half_t), ElementOutput (float). Communicating just the data type is not enough. As the data is laid out linearly in memory, we have to convey the layout of matrices. 
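The stride rules spelled out for example 05 above (batch_stride_A = lda * k for the stacked column-major A, batch_stride_B = k for the interleaved column-major B, and, per the size checks, ldc * n between batches of C) reduce to plain pointer arithmetic. Here is a small host-side sketch of that addressing for the NN case; the function and variable names are illustrative, and it mirrors the reference check in the example rather than the CUTLASS batched kernel itself.

```cpp
#include <cstdio>
#include <vector>

// Reference strided-batched GEMM (NN, column-major) using the stride rules above.
void strided_batched_gemm_nn_ref(
    int m, int n, int k, int batch_count,
    std::vector<float> const &A, int lda, long long batch_stride_A,   // = lda * k
    std::vector<float> const &B, int ldb, long long batch_stride_B,   // = k
    std::vector<float> &C,       int ldc, long long batch_stride_C) { // = ldc * n
  for (int b = 0; b < batch_count; ++b) {
    float const *A_b = A.data() + b * batch_stride_A;
    float const *B_b = B.data() + b * batch_stride_B;
    float       *C_b = C.data() + b * batch_stride_C;
    for (int col = 0; col < n; ++col) {
      for (int row = 0; row < m; ++row) {
        float acc = 0.f;
        for (int kk = 0; kk < k; ++kk) {
          // Column-major: element (r, c) of a matrix with leading dimension ld is at r + c * ld.
          acc += A_b[row + kk * lda] * B_b[kk + col * ldb];
        }
        C_b[row + col * ldc] = acc;
      }
    }
  }
}

int main() {
  int m = 6, n = 3, k = 2, batch_count = 2;       // the sizes used in the pictures above
  int lda = m, ldb = k * batch_count, ldc = m;
  std::vector<float> A(lda * k * batch_count, 1.0f);
  std::vector<float> B(ldb * n, 1.0f);
  std::vector<float> C(ldc * n * batch_count, 0.0f);
  strided_batched_gemm_nn_ref(m, n, k, batch_count,
                              A, lda, 1LL * lda * k,
                              B, ldb, k,
                              C, ldc, 1LL * ldc * n);
  std::printf("C[0] = %f\n", C[0]);  // expect k = 2 for all-ones inputs
  return 0;
}
```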
We do that by initializing template variable LayoutInputA to column major cutlass variable, LayoutInputB -to row major and LayoutOutput to row major. Next, we setup rules to comptue alpha * X + beta * C +to row major and LayoutOutput to row major. Next, we setup rules to compute alpha * X + beta * C which is called epilogue of the kernel. We initialize template variable EpilogueOp, which takes the -data type of output ElementOutput (int32_t), the number of elements per vector memory access (16), -data type of accumulator (int32_t) and data type of computation of linear combination (alpha * X + +data type of output ElementOutput (float), the number of elements per vector memory access (16), +data type of accumulator (float) and data type of computation of linear combination (alpha * X + beta * C). Now that we setup the properties of data, we have to setup properties of computation. @@ -85,7 +85,7 @@ Now that we setup the properties of data, we have to setup properties of computa Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32, 64x64x4, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally deduce the amount of threads needed per thread-block, amount of shared memory, storing data in -bank-conflict free manner, and ton of other variables required to compose, intialize and launch a +bank-conflict free manner, and ton of other variables required to compose, initialize and launch a high performance GEMM kernel. This is the beauty of CUTLASS, it relieves developer from understanding and coding complicated hardware optimizations which can easily go wrong. @@ -95,7 +95,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan These are all put together to create a template variable which describes CUTLASS GEMM kernel using cutlass::gemm::device::GemmSplitKParallel template. -The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. +The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it. We use CUTLASS utilities to initialize, fill, compare matrices as they are simple and doesn't come in the way of learning CUTLASS. @@ -103,7 +103,7 @@ Once all the matrices are initialized and filled with data, create arguments tup kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space memory required by the kernel we instantiated. If yes, we create it and pass it along with other -arguments created to intialize CUTLASS kernel then, the kernel is launched. +arguments created to initialize CUTLASS kernel then, the kernel is launched. In this example, we later on launch a reference gemm kernel (from CUTLASS utilities) to compare if the output from CUTLASS kernel is same as reference GEMM kernel. @@ -149,9 +149,6 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = // This code section describes the size of MMA op using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 4>; // <- MMA Op tile M = 8, N = 8, K = 4 -// This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? - // This code section describes ? 
using EpilogueOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, // <- data type of output matrix diff --git a/examples/07_volta_tensorop_gemm/CMakeLists.txt b/examples/07_volta_tensorop_gemm/CMakeLists.txt index c53367ac5a..2503cd3d43 100644 --- a/examples/07_volta_tensorop_gemm/CMakeLists.txt +++ b/examples/07_volta_tensorop_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu index eecd86cd45..23c2d9f45f 100644 --- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu +++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -75,7 +75,7 @@ Now that we setup the properties of data, we have to setup properties of computa Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32, 64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally deduce the amount of threads needed per thread-block, amount of shared memory, storing data in -bank-conflict free manner, and ton of other variables required to compose, intialize and launch a +bank-conflict free manner, and ton of other variables required to compose, initialize and launch a high performance GEMM kernel. This is the beauty of CUTLASS, it relieves developer from understanding and coding complicated hardware optimizations which can easily go wrong. @@ -107,7 +107,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan These are all put together to create a template variable which describes CUTLASS GEMM kernel using cutlass::gemm::device::Gemm template. -The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. +The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it. We use CUTLASS utilities to initialize, fill, compare matrices as they are simple and doesn't come in the way of learning CUTLASS. @@ -115,7 +115,7 @@ Once all the matrices are initialized and filled with data, create arguments tup kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space memory required by the kernel we instantiated. If yes, we create it and pass it along with other -arguments created to intialize CUTLASS kernel then, the kernel is launched. +arguments created to initialize CUTLASS kernel then, the kernel is launched. In this example, we later on launch a reference gemm kernel (from CUTLASS utilities) to compare if the output from CUTLASS kernel is same as reference GEMM kernel. 
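The walkthroughs corrected in examples 06, 07 and 08 above all follow the same CUTLASS 2.x recipe: choose element types and layouts, choose threadblock, warp and instruction tile shapes, choose an epilogue, compose a device-level GEMM, then query workspace, initialize and launch. The sketch below condenses that recipe in the style of example 07. It is a sketch under assumptions rather than a drop-in replacement: it presumes an SM70-class GPU, the header paths named in the includes, and the tile sizes quoted in the comments above, and it trims error handling to a single status check.

```cpp
#include <cstdint>
#include <iostream>

#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/device_memory.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"

// Element types: f16 inputs, f32 accumulation, f32 output and epilogue compute.
using ElementInputA = cutlass::half_t;
using ElementInputB = cutlass::half_t;
using ElementOutput = float;
using ElementAccumulator = float;
using ElementComputeEpilogue = ElementAccumulator;

// Layouts: column-major A, row-major B and output.
using LayoutInputA = cutlass::layout::ColumnMajor;
using LayoutInputB = cutlass::layout::RowMajor;
using LayoutOutput = cutlass::layout::RowMajor;

// Tensor cores on Volta with the tile shapes quoted in the comments above.
using MMAOp  = cutlass::arch::OpClassTensorOp;
using SmArch = cutlass::arch::Sm70;
using ShapeMMAThreadBlock = cutlass::gemm::GemmShape<128, 128, 32>;
using ShapeMMAWarp        = cutlass::gemm::GemmShape<64, 64, 32>;
using ShapeMMAOp          = cutlass::gemm::GemmShape<8, 8, 4>;

// Epilogue: D = alpha * (A x B) + beta * C, vectorized as 128-bit accesses.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
    ElementAccumulator, ElementComputeEpilogue>;

using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
constexpr int NumStages = 2;

using Gemm = cutlass::gemm::device::Gemm<
    ElementInputA, LayoutInputA, ElementInputB, LayoutInputB,
    ElementOutput, LayoutOutput, ElementAccumulator, MMAOp, SmArch,
    ShapeMMAThreadBlock, ShapeMMAWarp, ShapeMMAOp,
    EpilogueOp, SwizzleThreadBlock, NumStages>;

int main() {
  cutlass::gemm::GemmCoord problem_size(5120, 4096, 4096);

  cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(problem_size.mk());
  cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(problem_size.kn());
  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(problem_size.mn());
  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(problem_size.mn());

  // Trivial fills keep the sketch short; the examples use random distributions instead.
  cutlass::reference::host::TensorFill(tensor_a.host_view(), ElementInputA(1));
  cutlass::reference::host::TensorFill(tensor_b.host_view(), ElementInputB(1));
  cutlass::reference::host::TensorFill(tensor_c.host_view());
  tensor_a.sync_device();
  tensor_b.sync_device();
  tensor_c.sync_device();
  tensor_d.sync_device();

  int split_k_slices = 1;
  Gemm::Arguments arguments{problem_size,
                            tensor_a.device_ref(), tensor_b.device_ref(),
                            tensor_c.device_ref(), tensor_d.device_ref(),
                            {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
                            split_k_slices};

  // Query scratch space, allocate it, then initialize and launch the kernel.
  size_t workspace_size = Gemm::get_workspace_size(arguments);
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  Gemm gemm_op;
  cutlass::Status status = gemm_op.can_implement(arguments);
  if (status == cutlass::Status::kSuccess) {
    status = gemm_op.initialize(arguments, workspace.get());
  }
  if (status == cutlass::Status::kSuccess) {
    status = gemm_op();
  }

  std::cout << (status == cutlass::Status::kSuccess ? "Passed" : "Failed") << std::endl;
  return status == cutlass::Status::kSuccess ? 0 : -1;
}
```

Broadly, example 06 builds its split-K variant the same way, composing cutlass::gemm::device::GemmSplitKParallel instead and passing a split_k_slices value greater than one.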
@@ -162,7 +162,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 4>; // <- MMA Op tile M = 8, N = 8, K = 4 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // This code section describes ? using EpilogueOp = cutlass::epilogue::thread::LinearCombination< diff --git a/examples/08_turing_tensorop_gemm/CMakeLists.txt b/examples/08_turing_tensorop_gemm/CMakeLists.txt index c84bc6d49b..2e0a54817d 100644 --- a/examples/08_turing_tensorop_gemm/CMakeLists.txt +++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu index 1869c41f00..34f682deb0 100644 --- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu +++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -74,7 +74,7 @@ Now that we setup the properties of data, we have to setup properties of computa Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x256x64, 64x64x16, 8x8x16 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally deduce the amount of threads needed per thread-block, amount of shared memory, storing data in -bank-conflict free manner, and ton of other variables required to compose, intialize and launch a +bank-conflict free manner, and ton of other variables required to compose, initialize and launch a high performance GEMM kernel. This is the beauty of CUTLASS, it relieves developer from understanding and coding complicated hardware optimizations which can easily go wrong. @@ -106,7 +106,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan These are all put together to create a template variable which describes CUTLASS GEMM kernel using cutlass::gemm::device::Gemm template. -The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. +The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it. We use CUTLASS utilities to initialize, fill, compare matrices as they are simple and doesn't come in the way of learning CUTLASS. @@ -114,7 +114,7 @@ Once all the matrices are initialized and filled with data, create arguments tup kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space memory required by the kernel we instantiated. 
If yes, we create it and pass it along with other -arguments created to intialize CUTLASS kernel then, the kernel is launched. +arguments created to initialize CUTLASS kernel then, the kernel is launched. In this example, we later on launch a reference gemm kernel (from CUTLASS utilities) to compare if the output from CUTLASS kernel is same as reference GEMM kernel. @@ -140,8 +140,8 @@ using ElementInputA = int8_t; // <- data type of elements using ElementInputB = int8_t; // <- data type of elements in input matrix B using ElementOutput = int32_t; // <- data type of elements in output matrix D -// The code section below describes matrix layout of input and output matrices. Column Major for -// Matrix A, Row Major for Matrix B and Row Major for Matrix C +// The code section below describes matrix layout of input and output matrices. Row Major for +// Matrix A, Column Major for Matrix B and Row Major for Matrix C using LayoutInputA = cutlass::layout::RowMajor; using LayoutInputB = cutlass::layout::ColumnMajor; using LayoutOutput = cutlass::layout::RowMajor; @@ -161,7 +161,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 64>; // <- warp tile M = using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 16>; // <- MMA Op tile M = 8, N = 8, K = 16 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // This code section describes the epilogue part of the kernel using EpilogueOp = cutlass::epilogue::thread::LinearCombination< @@ -355,4 +355,3 @@ int main() { return run(); } - diff --git a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt index 09057a28d5..673064edc4 100644 --- a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt +++ b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu index bd74ce12da..adca0568bd 100644 --- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -76,7 +76,7 @@ Now that we setup the properties of data, we have to setup properties of computa Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x128, 64x64x128, 8x8x32 (MxNxK) respectively. 
When passed to instantiate CUTLASS Implicit GEMM kernel, it internally deduces the amount of threads needed per thread-block, amount of shared memory, storing -data in bank-conflict free manner, and ton of other variables required to compose, intialize and +data in bank-conflict free manner, and ton of other variables required to compose, initialize and launch a high performance Implicit GEMM kernel. This is the beauty of CUTLASS, it relieves developer from understanding and coding complicated hardware optimizations which can easily go wrong. @@ -108,7 +108,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan These are all put together to create a template variable which describes CUTLASS Implicit GEMM kernel using cutlass::conv::device::ImplicitGemm template. -The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. +The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it. We use CUTLASS utilities to initialize, fill, compare tensors as they are simple and doesn't come in the way of learning CUTLASS. @@ -117,7 +117,7 @@ kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space memory required by the kernel we instantiated. If yes, we create it and pass it along with other -arguments created to intialize CUTLASS kernel then, the kernel is launched. +arguments created to initialize CUTLASS kernel then, the kernel is launched. In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel. @@ -143,7 +143,6 @@ compare if the output from CUTLASS kernel is same as the reference implicit GEMM #include "cutlass/util/tensor_view_io.h" #include "helper.h" - // The code section below describes datatype for input, output tensors and computation between // elements using ElementAccumulator = int32_t; // Data type of accumulator @@ -555,6 +554,7 @@ Result profile_convolution(Options const &options) { LayoutOutput, ElementComputeEpilogue, ElementAccumulator, + ElementOutput, cutlass::NumericConverterClamp >( problem_size, @@ -674,7 +674,6 @@ Result profile_convolution(Options const &options) { return result; } - ///////////////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char const **args) { @@ -761,11 +760,7 @@ int main(int argc, char const **args) { Result::print_header(std::cout, options) << std::endl; result.print(std::cout, 1, options) << std::endl; } - return 0; } ///////////////////////////////////////////////////////////////////////////////////////////////// - - - diff --git a/examples/10_planar_complex/CMakeLists.txt b/examples/10_planar_complex/CMakeLists.txt index eaf41fde7c..ebe78d6b61 100644 --- a/examples/10_planar_complex/CMakeLists.txt +++ b/examples/10_planar_complex/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
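Example 09's walkthrough above maps a forward convolution onto an implicit GEMM. The bookkeeping behind that mapping is small enough to show directly. A minimal sketch using the problem size quoted above (N=1, H=64, W=64, C=128 activations and K=128, R=3, S=3 filters) with illustrative padding, stride and dilation values, assuming the usual cross-correlation output-extent formula:

```cpp
#include <cstdio>

int main() {
  // Activation N x H x W x C and filter K x R x S x C from example 09.
  int N = 1, H = 64, W = 64, C = 128;
  int K = 128, R = 3, S = 3;
  int pad_h = 1, pad_w = 1, stride_h = 1, stride_w = 1, dil_h = 1, dil_w = 1;

  // Output extents of the convolution.
  int P = (H + 2 * pad_h - dil_h * (R - 1) - 1) / stride_h + 1;
  int Q = (W + 2 * pad_w - dil_w * (S - 1) - 1) / stride_w + 1;

  // Forward propagation (fprop) maps onto a GEMM of these dimensions.
  long long gemm_m = 1LL * N * P * Q;
  long long gemm_n = K;
  long long gemm_k = 1LL * R * S * C;

  std::printf("output %dx%dx%dx%d, implicit GEMM %lldx%lldx%lld\n",
              N, P, Q, K, gemm_m, gemm_n, gemm_k);
  return 0;
}
```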
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -27,7 +27,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - +# +# This example depends on the CUTLASS Library +# +if (CUTLASS_ENABLE_LIBRARY) # Planar Complex GEMM example cutlass_example_add_executable( @@ -35,15 +38,12 @@ cutlass_example_add_executable( planar_complex.cu ) - -# -# This example depends on the CUTLASS Library -# - target_link_libraries( 10_planar_complex PRIVATE cutlass_lib cutlass_tools_util_includes + cuda ) +endif() diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu index 9a9dc88888..2d7ee95eec 100644 --- a/examples/10_planar_complex/planar_complex.cu +++ b/examples/10_planar_complex/planar_complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/11_planar_complex_array/CMakeLists.txt b/examples/11_planar_complex_array/CMakeLists.txt index b5ad07cf7a..0e3fc9e987 100644 --- a/examples/11_planar_complex_array/CMakeLists.txt +++ b/examples/11_planar_complex_array/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -27,7 +27,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - +# +# This example depends on the CUTLASS Library +# +if (CUTLASS_ENABLE_LIBRARY) # Planar Complex Array GEMM example cutlass_example_add_executable( @@ -35,15 +38,12 @@ cutlass_example_add_executable( planar_complex_array.cu ) - -# -# This example depends on the CUTLASS Library -# - target_link_libraries( 11_planar_complex_array PRIVATE cutlass_lib cutlass_tools_util_includes + cuda ) +endif() diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu index 272390f26b..0df6e57284 100644 --- a/examples/11_planar_complex_array/planar_complex_array.cu +++ b/examples/11_planar_complex_array/planar_complex_array.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/12_gemm_bias_relu/CMakeLists.txt b/examples/12_gemm_bias_relu/CMakeLists.txt index 3b681b31fa..e3e428dfcb 100644 --- a/examples/12_gemm_bias_relu/CMakeLists.txt +++ b/examples/12_gemm_bias_relu/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu index 62eb294028..bca8e0ac74 100644 --- a/examples/12_gemm_bias_relu/gemm_bias_relu.cu +++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -54,12 +54,11 @@ using ElementInputA = cutlass::half_t; // <- data type of elements using ElementInputB = cutlass::half_t; // <- data type of elements in input matrix B using ElementOutput = float; // <- data type of elements in output matrix D -// The code section below describes matrix layout of input and output matrices. -// Column Major for Matrix A, B and C. - // Note that if the output is column major, the bias has to be per row. i.e. every row has different bias. // If the output is row major, the bias has to be per column, i.e. every column has different bias. // Below list some other notices: +// +// Note this example only works for ColumnMajor output because // 1) we only have row major epilogue. // 2) we swap A and B if the output is column major then we can still use the // row major epilogue. @@ -82,10 +81,10 @@ using ShapeMMAThreadBlock = // This code section describes tile size a warp will compute using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = 64, N = 64, K = 32 // This code section describes the size of MMA op -using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 8, N = 8, K = 4 +using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 16, N = 8, K = 8 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // Define the epilogue operation as LinearCombinationRelu. This is approximately equal to // diff --git a/examples/13_two_tensor_op_fusion/CMakeLists.txt b/examples/13_two_tensor_op_fusion/CMakeLists.txt index 04d55bbec5..6819a9766e 100644 --- a/examples/13_two_tensor_op_fusion/CMakeLists.txt +++ b/examples/13_two_tensor_op_fusion/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -64,6 +64,7 @@ endforeach() foreach(FUSION_GEMM_EXAMPLE fused_two_gemms_f16_sm75_rf fused_two_gemms_f16_sm75_shmem + fused_two_gemms_grouped_f16_sm80_rf fused_two_gemms_f16_sm80_rf fused_two_gemms_f16_sm80_shmem fused_two_gemms_s8_sm75_rf @@ -79,4 +80,3 @@ foreach(FUSION_GEMM_EXAMPLE add_dependencies(13_fused_two_gemms 13_${FUSION_GEMM_EXAMPLE}) endforeach() - diff --git a/examples/13_two_tensor_op_fusion/README.md b/examples/13_two_tensor_op_fusion/README.md index 134644a063..4b9cb6d128 100644 --- a/examples/13_two_tensor_op_fusion/README.md +++ b/examples/13_two_tensor_op_fusion/README.md @@ -1,11 +1,11 @@ # Introduction -This example shows fusing two back-to-back GEMMs/Convolutions into one kernel. +This example shows fusing two back-to-back GEMMs/Convolutions into one kernel.

-When running two unfused GEMM/Conv operations, each operation loads one input -activation matrix, one weight matrix (or filter matrix) from the memory and then +When running two unfused GEMM/Conv operations, each operation loads one input +activation matrix, one weight matrix (or filter matrix) from the memory and then stores the result activation matrix back to the memory. When the two GEMM/Conv operations are fused together, the mainloops of the two @@ -27,10 +27,10 @@ In order to run two GEMM/Convs in a single kernel, the example requires the same threadblocks are used across 2 GEMMs/Convs. This also ensures the same threadblock tile M across 2 GEMMs/Convs. -In order to reuse the output accumulator (stored in register-file) of the 1st GEMM as the +In order to reuse the output accumulator (stored in register-file) of the 1st GEMM as the input activation, the example enforces the following two constraints: -- thread_block_tile_N = problem_N +- thread_block_tile_N = problem_N

@@ -39,7 +39,7 @@ addition to its own input activation tile. Therefore the input activation tile o 2nd GEMM/Conv only depends on the output activation tile of the 1st GEMM/Conv, and the operation can be fully block-resident. -- warp_tile_N = thread_block_tile_N +- warp_tile_N = thread_block_tile_N
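For concreteness, below is a minimal sketch of tile shapes that satisfy the two constraints stated above, assuming the first GEMM has N = 64 and the second has N = 128; the alias names mirror the `ThreadblockShape0/1` and `WarpShape0/1` template parameters this example uses, but the concrete sizes here are only illustrative.

```c++
#include "cutlass/gemm/gemm.h"

// GEMM0: the threadblock tile N must cover the full problem N of the first GEMM,
// so the entire output row strip of GEMM0 is produced inside one threadblock.
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;   // kN == problem_size_0.n()
// The warp tile N must equal the threadblock tile N, so each warp holds the full
// accumulator strip it will feed into the second GEMM (register-file resident fusion).
using WarpShape0        = cutlass::gemm::GemmShape<32, 64, 32>;   // kN == ThreadblockShape0::kN

// GEMM1: the same two rules applied to the second GEMM's N dimension.
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;  // kN == problem_size_1.n()
using WarpShape1        = cutlass::gemm::GemmShape<32, 128, 32>;  // kN == ThreadblockShape1::kN
```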

@@ -82,11 +82,11 @@ threadblock. Typically this requires the 2nd Convolution uses 1x1 filter without - `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_s8_sm75_shmem` - `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_s8_sm80_rf` - `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_s8_sm80_shmem` - + # Copyright -Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause ``` diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h index b050906317..03ae75c62c 100644 --- a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/b2b_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h index 6cc4ffd9f4..8e828d1f17 100644 --- a/examples/13_two_tensor_op_fusion/b2b_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -42,6 +42,7 @@ #include "cutlass/util/reference/host/tensor_compare.h" #include "cutlass/util/reference/host/tensor_norm.h" #include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/gemm_complex.h" #include "cutlass/util/reference/device/tensor_relu.h" #include "reference/device/tensor_scale_bias.h" @@ -77,9 +78,9 @@ struct B2bNonFusedGemmRun // B2bNonFusedGemmRun( - cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, - cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, - cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform, uint64_t seed_ = 2080 ): @@ -88,7 +89,7 @@ struct B2bNonFusedGemmRun /// Helper to initialize a tensor view template bool initialize_tensor( - cutlass::TensorView view, + cutlass::TensorView view, cutlass::Distribution::Kind dist_kind, uint64_t seed) { @@ -96,7 +97,7 @@ struct B2bNonFusedGemmRun cutlass::reference::host::TensorFillRandomUniform( view, seed, 2, -2, 0); - } + } else if (dist_kind == cutlass::Distribution::Identity) { cutlass::reference::host::TensorFillIdentity(view); @@ -129,62 +130,62 @@ struct B2bNonFusedGemmRun /// Executes one test bool run( - cutlass::gemm::GemmCoord problem_size_0, - cutlass::gemm::GemmCoord problem_size_1, - ElementCompute alpha0 = ElementCompute(1), + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute 
alpha0 = ElementCompute(1), ElementCompute beta0 = ElementCompute(0), - ElementCompute alpha1 = ElementCompute(1), + ElementCompute alpha1 = ElementCompute(1), ElementCompute beta1 = ElementCompute(0), bool relu = true, int warm_ups = 1, int runs = 100) { - + // // Allocate the GEMM workspace // cutlass::HostTensor< - typename Gemm0::ElementA, + typename Gemm0::ElementA, typename Gemm0::LayoutA> tensor_A0(problem_size_0.mk()); cutlass::HostTensor< - typename Gemm0::ElementB, + typename Gemm0::ElementB, typename Gemm0::LayoutB> tensor_B0(problem_size_0.kn()); cutlass::HostTensor< - typename Gemm0::ElementC, + typename Gemm0::ElementC, typename Gemm0::LayoutC> tensor_C0(problem_size_0.mn()); cutlass::HostTensor< - ElementCompute, + ElementCompute, typename Gemm0::LayoutC> tensor_Bias0({1, problem_size_0.n()}); cutlass::HostTensor< - typename Gemm0::ElementC, + typename Gemm0::ElementC, typename Gemm0::LayoutC> tensor_D0(problem_size_0.mn()); cutlass::HostTensor< - typename Gemm0::ElementC, + typename Gemm0::ElementC, typename Gemm0::LayoutC> reference_D0(problem_size_0.mn()); cutlass::HostTensor< - typename Gemm1::ElementB, + typename Gemm1::ElementB, typename Gemm1::LayoutB> tensor_B1(problem_size_1.kn()); cutlass::HostTensor< - typename Gemm1::ElementC, + typename Gemm1::ElementC, typename Gemm1::LayoutC> tensor_C1(problem_size_1.mn()); cutlass::HostTensor< - ElementCompute, + ElementCompute, typename Gemm1::LayoutC> tensor_Bias1({1, problem_size_1.n()}); cutlass::HostTensor< - typename Gemm1::ElementC, + typename Gemm1::ElementC, typename Gemm1::LayoutC> tensor_D1(problem_size_1.mn()); cutlass::HostTensor< - typename Gemm1::ElementC, + typename Gemm1::ElementC, typename Gemm1::LayoutC> reference_D1(problem_size_1.mn()); @@ -270,13 +271,13 @@ struct B2bNonFusedGemmRun for(int i = 0; i < runs; i++) { status = gemm_op_0(); - + CUTLASS_CHECK(status); } cudaEventRecord(stop1); for(int i = 0; i < runs; i++) { status = gemm_op_1(); - + CUTLASS_CHECK(status); } @@ -312,32 +313,32 @@ struct B2bNonFusedGemmRun reference_gemm_0( problem_size_0, - alpha0, - tensor_A0.device_ref(), - tensor_B0.device_ref(), - beta0, + alpha0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + beta0, {tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)}, reference_D0.device_ref() ); if(relu) { - cutlass::reference::device::TensorReLu(reference_D0.device_view()); + cutlass::reference::device::TensorReLu(reference_D0.device_view()); } reference_gemm_1( problem_size_1, - alpha1, - reference_D0.device_ref(), - tensor_B1.device_ref(), + alpha1, + reference_D0.device_ref(), + tensor_B1.device_ref(), beta1, {tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)}, reference_D1.device_ref() ); - + if(relu) { - cutlass::reference::device::TensorReLu(reference_D1.device_view()); + cutlass::reference::device::TensorReLu(reference_D1.device_view()); } - + // Wait for kernels to finish cudaDeviceSynchronize(); reference_D0.sync_host(); @@ -349,7 +350,7 @@ struct B2bNonFusedGemmRun CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); bool passed = cutlass::reference::host::TensorEquals( - reference_D1.host_view(), + reference_D1.host_view(), tensor_D1.host_view()); CHECK_TRUE(passed); @@ -362,7 +363,7 @@ struct B2bNonFusedGemmRun std::ofstream file(fname.str()); - file + file << "A0 =\n" << tensor_A0.host_view() << "\nB0 =\n" << tensor_B0.host_view() << "\nC0 =\n" << tensor_C0.host_view() @@ -399,9 +400,9 @@ struct B2bFusedGemmRun // B2bFusedGemmRun( - cutlass::Distribution::Kind init_A_ 
= cutlass::Distribution::Uniform, - cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, - cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, cutlass::Distribution::Kind init_Scale_ = cutlass::Distribution::Uniform, cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform, uint64_t seed_ = 2080 @@ -412,7 +413,7 @@ struct B2bFusedGemmRun /// Helper to initialize a tensor view template bool initialize_tensor( - cutlass::TensorView view, + cutlass::TensorView view, cutlass::Distribution::Kind dist_kind, uint64_t seed) { @@ -420,11 +421,11 @@ struct B2bFusedGemmRun cutlass::reference::host::TensorFillRandomUniform( view, seed, 2, -2, 0); - } + } else if (dist_kind == cutlass::Distribution::Identity) { cutlass::reference::host::TensorFillIdentity(view); - } + } else if (dist_kind == cutlass::Distribution::Gaussian) { cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); @@ -453,70 +454,90 @@ struct B2bFusedGemmRun /// Executes one test bool run( - cutlass::gemm::GemmCoord problem_size_0, - cutlass::gemm::GemmCoord problem_size_1, - ElementCompute alpha0 = ElementCompute(1), + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute alpha0 = ElementCompute(1), ElementCompute beta0 = ElementCompute(0), - ElementCompute alpha1 = ElementCompute(1), + ElementCompute alpha1 = ElementCompute(1), ElementCompute beta1 = ElementCompute(0), + cutlass::gemm::GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm, + + // batch_count is used as split-k when mode is kGemm according + // to the GemmUniversal interface + + int batch_count = 1, + int64_t batch_stride_A0 = 0, + int64_t batch_stride_B0 = 0, + int64_t batch_stride_C0 = 0, + int64_t batch_stride_B1 = 0, + int64_t batch_stride_C1 = 0, + int64_t batch_stride_D1 = 0, + int64_t batch_stride_Bias0 = 0, + int64_t batch_stride_Scale0 = 0, bool relu = true, int warm_ups = 1, int runs = 100) { - + // // Allocate the GEMM workspace // + cutlass::gemm::GemmCoord CoordA0(problem_size_0.m(), problem_size_0.n(), batch_count * problem_size_0.k()); + cutlass::gemm::GemmCoord CoordB0(problem_size_0.m(), problem_size_0.n(), batch_count * problem_size_0.k()); + cutlass::gemm::GemmCoord CoordC0(problem_size_0.m(), batch_count * problem_size_0.n(), problem_size_0.k()); + cutlass::gemm::GemmCoord CoordB1(problem_size_1.m(), problem_size_1.n(), batch_count * problem_size_1.k()); + cutlass::gemm::GemmCoord CoordC1(problem_size_1.m(), batch_count * problem_size_1.n(), problem_size_1.k()); + cutlass::HostTensor< - typename B2bGemm::ElementA, - typename B2bGemm::LayoutA> tensor_A0(problem_size_0.mk()); + typename B2bGemm::ElementA, + typename B2bGemm::LayoutA> tensor_A0(CoordA0.mk()); cutlass::HostTensor< - typename B2bGemm::ElementB, - typename B2bGemm::LayoutB> tensor_B0(problem_size_0.kn()); + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B0(CoordB0.kn()); cutlass::HostTensor< - typename B2bGemm::ElementC, - typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn()); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C0(CoordC0.mn()); cutlass::HostTensor< - typename B2bGemm::ElementScaleBias, + typename B2bGemm::ElementScaleBias, typename B2bGemm::LayoutScaleBias> tensor_Scale0; if(alpha0 == 
ElementCompute(0)) //per-channel scale - tensor_Scale0.resize({1, problem_size_0.n()}); + tensor_Scale0.resize({1, batch_count * problem_size_0.n()}); cutlass::HostTensor< - typename B2bGemm::ElementScaleBias, - typename B2bGemm::LayoutScaleBias> tensor_Bias0({1, problem_size_0.n()}); + typename B2bGemm::ElementScaleBias, + typename B2bGemm::LayoutScaleBias> tensor_Bias0({1, batch_count * problem_size_0.n()}); cutlass::HostTensor< - ElementAccumulator, - typename B2bGemm::LayoutC> reference_Z0(problem_size_0.mn()); + ElementAccumulator, + typename B2bGemm::LayoutC> reference_Z0(CoordC0.mn()); cutlass::HostTensor< - typename B2bGemm::ElementC, - typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn()); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D0(CoordC0.mn()); cutlass::HostTensor< - typename B2bGemm::ElementB, - typename B2bGemm::LayoutB> tensor_B1(problem_size_1.kn()); + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B1(CoordB1.kn()); cutlass::HostTensor< - typename B2bGemm::ElementC, - typename B2bGemm::LayoutC> tensor_C1(problem_size_1.mn()); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C1(CoordC1.mn()); cutlass::HostTensor< - ElementCompute, - typename B2bGemm::LayoutScaleBias> tensor_Bias1({1, problem_size_1.n()}); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutScaleBias> tensor_Bias1({1, batch_count * problem_size_1.n()}); cutlass::HostTensor< - typename B2bGemm::ElementC, - typename B2bGemm::LayoutC> tensor_D1(problem_size_1.mn()); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_D1(CoordC1.mn()); cutlass::HostTensor< - typename B2bGemm::ElementC, - typename B2bGemm::LayoutC> reference_D1(problem_size_1.mn()); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D1(CoordC1.mn()); CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); @@ -554,6 +575,7 @@ struct B2bFusedGemmRun // typename B2bGemm::Arguments arguments{ + mode, problem_size_0, problem_size_1, tensor_A0.device_ref(), @@ -564,8 +586,16 @@ struct B2bFusedGemmRun tensor_B1.device_ref(), {tensor_Bias1.device_data(), typename B2bGemm::LayoutC::Stride(0)}, tensor_D1.device_ref(), + batch_stride_A0, + batch_stride_B0, + batch_stride_B1, + batch_stride_C1, + batch_stride_D1, + batch_stride_Bias0, + batch_stride_Scale0, {alpha0, beta0}, {alpha1, beta1}, + batch_count, }; B2bGemm b2b_gemm_op; @@ -618,32 +648,31 @@ struct B2bFusedGemmRun // Verify // - cutlass::reference::device::Gemm< - typename B2bGemm::ElementA, typename B2bGemm::LayoutA, - typename B2bGemm::ElementB, typename B2bGemm::LayoutB, - ElementAccumulator, typename B2bGemm::LayoutC, - ElementAccumulator, ElementAccumulator> - reference_gemm_0; - - cutlass::reference::device::Gemm< - typename B2bGemm::ElementA, typename B2bGemm::LayoutA, - typename B2bGemm::ElementB, typename B2bGemm::LayoutB, - typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute, - ElementAccumulator, typename B2bGemm::Operator> - reference_gemm_1; + cutlass::reference::device::GemmComplex< + typename B2bGemm::ElementA, typename B2bGemm::LayoutA, + typename B2bGemm::ElementB, typename B2bGemm::LayoutB, + ElementAccumulator, typename B2bGemm::LayoutC, + ElementAccumulator, ElementAccumulator + >( - reference_gemm_0( problem_size_0, ElementAccumulator(1), //intermediate alpha=1 - tensor_A0.device_ref(), - tensor_B0.device_ref(), + tensor_A0.device_ref(), + cutlass::ComplexTransform::kNone, + tensor_B0.device_ref(), + cutlass::ComplexTransform::kNone, 
ElementAccumulator(0), //beta = 0 reference_Z0.device_ref(), reference_Z0.device_ref(), - ElementAccumulator(0) + ElementAccumulator(0), + int(batch_count), + batch_stride_A0, + batch_stride_B0, + batch_stride_C0, + batch_stride_C0 ); - cutlass::reference::device::TensorScaleBiasGemm< + cutlass::reference::device::TensorScaleBiasGemmBatched< ElementAccumulator, typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute, typename B2bGemm::LayoutScaleBias > ( @@ -652,25 +681,45 @@ struct B2bFusedGemmRun reference_D0.device_ref(), alpha0, tensor_Scale0.device_ref(), - tensor_Bias0.device_ref() + tensor_Bias0.device_ref(), + int(batch_count), + batch_stride_C0, + batch_stride_C0, + batch_stride_Scale0, + batch_stride_Bias0 ); if(relu) { - cutlass::reference::device::TensorReLu(reference_D0.device_view()); + cutlass::reference::device::TensorReLu(reference_D0.device_view()); } - reference_gemm_1( + cutlass::reference::device::GemmComplex< + typename B2bGemm::ElementA, typename B2bGemm::LayoutA, + typename B2bGemm::ElementB, typename B2bGemm::LayoutB, + typename B2bGemm::ElementC, typename B2bGemm::LayoutC, + ElementCompute, ElementAccumulator + >( problem_size_1, - alpha1, - reference_D0.device_ref(), - tensor_B1.device_ref(), - beta1, + alpha1, //intermediate alpha=1 + reference_D0.device_ref(), + cutlass::ComplexTransform::kNone, + tensor_B1.device_ref(), + cutlass::ComplexTransform::kNone, + beta1, //beta = 0 {tensor_Bias1.device_data(), typename B2bGemm::LayoutC::Stride(0)}, - reference_D1.device_ref() + reference_D1.device_ref(), + ElementAccumulator(0), + int(batch_count), + batch_stride_C0, + batch_stride_B1, + batch_stride_C1, + batch_stride_D1 ); + if(relu) { - cutlass::reference::device::TensorReLu(reference_D1.device_view()); + cutlass::reference::device::TensorReLu(reference_D1.device_view()); } + cudaDeviceSynchronize(); reference_D0.sync_host(); reference_D1.sync_host(); @@ -680,7 +729,7 @@ struct B2bFusedGemmRun CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); bool passed = cutlass::reference::host::TensorEquals( - reference_D1.host_view(), + reference_D1.host_view(), tensor_D1.host_view()); CHECK_TRUE(passed); @@ -694,7 +743,7 @@ struct B2bFusedGemmRun std::ofstream file(fname.str()); - file + file << "A0 =\n" << tensor_A0.host_view() << "\nB0 =\n" << tensor_B0.host_view() << "\nC0 =\n" << tensor_C0.host_view() diff --git a/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h new file mode 100644 index 0000000000..2206bac0e6 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h @@ -0,0 +1,450 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Containers for running grouped back-to-back GEMMs +*/ + +#pragma once + +#include +#include +#include + +#include "cutlass/util/device_memory.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/tensor_relu.h" + +#include "reference/device/tensor_scale_bias.h" +#include "helper.h" + +#define CHECK_GT(val1, val2) \ + if((val1) <= (val2)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n"; +#define CHECK_TRUE(val) \ + if(!(val)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n"; + +//////////////////////////////////////////////////////////////////////////////// + +template +struct B2bFusedGroupedGemmRun +{ + + using B2bGemm = B2bGemm_; + using ElementAccumulator = typename B2bGemm::ElementAccumulator; + using ElementCompute = typename B2bGemm::BaseKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + cutlass::Distribution::Kind init_Scale; + cutlass::Distribution::Kind init_Bias; + uint64_t seed; + + // + // Methods + // + + B2bFusedGroupedGemmRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_Scale_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), + init_Scale(init_Scale_), init_Bias(init_Bias_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 1, -1, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + 
cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else if (dist_kind == cutlass::Distribution::AllZeros) { + cutlass::reference::host::TensorFill(view, Element(0)); + } + else if (dist_kind == cutlass::Distribution::AllOnes) { + cutlass::reference::host::TensorFill(view, Element(1)); + } + else { + std::cerr << "Not implemented\n"; + return false; + } + + return true; + } + + /// Executes one test + bool run( + std::vector problem_sizes_0, + std::vector problem_sizes_1, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true, + int warm_ups = 1, + int runs = 100) { + + using HostTensorA = cutlass::HostTensor; + using HostTensorB = cutlass::HostTensor; + using HostTensorC = cutlass::HostTensor; + using HostTensorScale = cutlass::HostTensor; + using HostTensorZ = cutlass::HostTensor; + using HostTensorBias = cutlass::HostTensor; + + int problem_count = (int)problem_sizes_0.size(); + + std::vector host_tensor_A0(problem_count); + std::vector host_tensor_B0(problem_count); + std::vector host_tensor_C0(problem_count); + std::vector host_tensor_Scale0(problem_count); + std::vector host_tensor_Bias0(problem_count); + std::vector host_tensor_B1(problem_count); + std::vector host_tensor_C1(problem_count); + std::vector host_tensor_Bias1(problem_count); + std::vector host_tensor_D1(problem_count); + std::vector host_tensor_Z(problem_count); + std::vector host_tensor_ref_D0(problem_count); + std::vector host_tensor_ref_D1(problem_count); + + std::vector ref_A0(problem_count); + std::vector ref_B0(problem_count); + std::vector ref_C0(problem_count); + std::vector ref_Scale0(problem_count); + std::vector ref_Bias0(problem_count); + std::vector ref_B1(problem_count); + std::vector ref_C1(problem_count); + std::vector ref_Bias1(problem_count); + std::vector ref_D1(problem_count); + std::vector ref_Z(problem_count); + std::vector ref_ref_D0(problem_count); + std::vector ref_ref_D1(problem_count); + + for (int i = 0; i < problem_count; ++i) { + // + // Allocate the GEMM workspace + // + + auto problem_size_0 = problem_sizes_0[i]; + auto problem_size_1 = problem_sizes_1[i]; + + host_tensor_A0.at(i) = HostTensorA(problem_size_0.mk()); + host_tensor_B0.at(i) = HostTensorB(problem_size_0.kn()); + host_tensor_C0.at(i) = HostTensorC(problem_size_0.mn()); + if (alpha0 == ElementCompute(0)) //per-channel scale + host_tensor_Scale0.at(i) = HostTensorScale(typename HostTensorZ::Layout::TensorCoord{1, problem_size_0.n()}); + host_tensor_Bias0.at(i) = HostTensorScale(typename HostTensorBias::Layout::TensorCoord{1, problem_size_0.n()}); + host_tensor_Z.at(i) = HostTensorZ(problem_size_0.mn()); + host_tensor_ref_D0.at(i) = HostTensorC(problem_size_0.mn()); + host_tensor_B1.at(i) = HostTensorB(problem_size_1.kn()); + host_tensor_C1.at(i) = HostTensorC(problem_size_1.mn()); + host_tensor_Bias1.at(i) = HostTensorScale(typename HostTensorBias::Layout::TensorCoord{1, problem_size_1.n()}); + host_tensor_D1.at(i) = HostTensorC(problem_size_1.mn()); + host_tensor_ref_D1.at(i) = HostTensorC(problem_size_1.mn()); + + CHECK_TRUE(initialize_tensor(host_tensor_A0.at(i).host_view(), init_A, seed + 2019)); 
+ CHECK_TRUE(initialize_tensor(host_tensor_B0.at(i).host_view(), init_B, seed + 2018)); + CHECK_TRUE(initialize_tensor(host_tensor_C0.at(i).host_view(), init_C, seed + 2017)); + if (alpha0 == ElementCompute(0)) //per-channel scale + CHECK_TRUE(initialize_tensor(host_tensor_Scale0.at(i).host_view(), init_Scale, seed + 2014)); + CHECK_TRUE(initialize_tensor(host_tensor_Bias0.at(i).host_view(), init_Bias, seed + 2013)); + CHECK_TRUE(initialize_tensor(host_tensor_B1.at(i).host_view(), init_B, seed + 2016)); + CHECK_TRUE(initialize_tensor(host_tensor_C1.at(i).host_view(), init_C, seed + 2015)); + CHECK_TRUE(initialize_tensor(host_tensor_Bias1.at(i).host_view(), init_Bias, seed + 2012)); + + cutlass::reference::host::TensorFill( + host_tensor_D1.at(i).host_view()); + cutlass::reference::host::TensorFill( + host_tensor_ref_D0.at(i).host_view()); + cutlass::reference::host::TensorFill( + host_tensor_ref_D1.at(i).host_view()); + + host_tensor_A0.at(i).sync_device(); + host_tensor_B0.at(i).sync_device(); + host_tensor_C0.at(i).sync_device(); + if (alpha0 == ElementCompute(0)) //per-channel scale + host_tensor_Scale0.at(i).sync_device(); + host_tensor_Bias0.at(i).sync_device(); + host_tensor_B1.at(i).sync_device(); + host_tensor_C1.at(i).sync_device(); + host_tensor_Bias1.at(i).sync_device(); + host_tensor_D1.at(i).sync_device(); + host_tensor_ref_D0.at(i).sync_device(); + host_tensor_ref_D1.at(i).sync_device(); + + ref_A0.at(i) = (host_tensor_A0.at(i).device_ref()); + ref_B0.at(i) = (host_tensor_B0.at(i).device_ref()); + ref_C0.at(i) = (host_tensor_C0.at(i).device_ref()); + if (alpha0 == ElementCompute(0)) //per-channel scale + ref_Scale0.at(i) = (host_tensor_Scale0.at(i).device_ref()); + ref_Bias0.at(i) = (host_tensor_Bias0.at(i).device_ref()); + ref_B1.at(i) = (host_tensor_B1.at(i).device_ref()); + ref_C1.at(i) = {host_tensor_Bias1.at(i).device_data(), typename B2bGemm::LayoutC::Stride(0)}; + ref_Bias1.at(i) = (host_tensor_Bias1.at(i).device_ref()); + ref_D1.at(i) = (host_tensor_D1.at(i).device_ref()); + ref_Z.at(i) = (host_tensor_Z.at(i).device_ref()); + ref_ref_D0.at(i) = (host_tensor_ref_D0.at(i).device_ref()); + ref_ref_D1.at(i) = (host_tensor_ref_D1.at(i).device_ref()); + } + + // + // Initialize the GEMM operator + // + + cutlass::DeviceAllocation device_ref_A0(problem_count); + device_ref_A0.copy_from_host(ref_A0.data()); + cutlass::DeviceAllocation device_ref_B0(problem_count); + device_ref_B0.copy_from_host(ref_B0.data()); + cutlass::DeviceAllocation device_ref_C0(problem_count); + device_ref_C0.copy_from_host(ref_C0.data()); + cutlass::DeviceAllocation device_ref_Scale0(problem_count); + device_ref_Scale0.copy_from_host(ref_Scale0.data()); + cutlass::DeviceAllocation device_ref_Bias0(problem_count); + device_ref_Bias0.copy_from_host(ref_Bias0.data()); + cutlass::DeviceAllocation device_ref_B1(problem_count); + device_ref_B1.copy_from_host(ref_B1.data()); + cutlass::DeviceAllocation device_ref_C1(problem_count); + device_ref_C1.copy_from_host(ref_C1.data()); + cutlass::DeviceAllocation device_ref_Bias1(problem_count); + device_ref_Bias1.copy_from_host(ref_Bias1.data()); + cutlass::DeviceAllocation device_ref_D1(problem_count); + device_ref_D1.copy_from_host(ref_D1.data()); + + cutlass::DeviceAllocation device_problem_sizes_0(problem_count); + device_problem_sizes_0.copy_from_host(problem_sizes_0.data()); + cutlass::DeviceAllocation device_problem_sizes_1(problem_count); + device_problem_sizes_1.copy_from_host(problem_sizes_1.data()); + + B2bGemm b2b_gemm_op; + + int threadblock_count = 
B2bGemm::sufficient(problem_sizes_1.data(), problem_count); + if (!threadblock_count) { + std::cout << "Active CUDA device lacks hardware resources to run CUTLASS Grouped GEMM kernel." << std::endl; + return false; + } + + typename B2bGemm::Arguments arguments{ + problem_count, + device_problem_sizes_0.get(), + device_problem_sizes_1.get(), + device_ref_A0.get(), + device_ref_B0.get(), + device_ref_C0.get(), + device_ref_Scale0.get(), + device_ref_Bias0.get(), + device_ref_B1.get(), + device_ref_C1.get(), + device_ref_D1.get(), + {alpha0, beta0}, + {alpha1, beta1}, + threadblock_count + }; + + cutlass::Status status = b2b_gemm_op.can_implement(arguments); + + if(status != cutlass::Status::kSuccess) { + std::cout << "Problem sizes not supported.\n" + << "Requirments:\n" + << " problem_size_0.M = problem_size_1.M\n" + << " problem_size_0.N = problem_size_1.K\n" + << " ThreadblockShape0::kN = problem_size_0.N\n" + << " ThreadblockShape1::kN = problem_size_1.N" << std::endl; + } + + status = b2b_gemm_op.initialize(arguments); + + CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = b2b_gemm_op(); + CUTLASS_CHECK(status); + } + + // + // Run the GEMM + // + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for(int i = 0; i < runs; i++) { + status = b2b_gemm_op(); + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop); + cudaDeviceSynchronize(); + float gemmTime; + cudaEventElapsedTime(&gemmTime, start, stop); + std::cout << "Fusion time " << gemmTime / (float)runs << " ms\n"; + + for (int i = 0; i < problem_count; ++i) { + host_tensor_D1.at(i).sync_host(); + + // + // Verify + // + + cutlass::reference::device::Gemm< + typename B2bGemm::ElementA, typename B2bGemm::LayoutA, + typename B2bGemm::ElementB, typename B2bGemm::LayoutB, + ElementAccumulator, typename B2bGemm::LayoutC, + ElementAccumulator, ElementAccumulator> + reference_gemm_0; + + cutlass::reference::device::Gemm< + typename B2bGemm::ElementA, typename B2bGemm::LayoutA, + typename B2bGemm::ElementB, typename B2bGemm::LayoutB, + typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute, + ElementAccumulator> + reference_gemm_1; + + auto problem_size_0 = problem_sizes_0[i]; + auto problem_size_1 = problem_sizes_1[i]; + + reference_gemm_0( + problem_size_0, + ElementAccumulator(1), //intermediate alpha=1 + ref_A0.at(i), + ref_B0.at(i), + ElementAccumulator(0), //beta = 0 + ref_Z.at(i), + ref_Z.at(i), + ElementAccumulator(0) + ); + + cutlass::reference::device::TensorScaleBiasGemm< + ElementAccumulator, typename B2bGemm::ElementC, typename B2bGemm::LayoutC, + ElementCompute, typename B2bGemm::LayoutC + > ( + problem_size_0, + ref_Z.at(i), + ref_ref_D0.at(i), + alpha0, + ref_Scale0.at(i), + ref_Bias0.at(i) + ); + + if(relu) { + cutlass::reference::device::TensorReLu(host_tensor_ref_D0.at(i).device_view()); + } + + reference_gemm_1( + problem_size_1, + alpha1, + ref_ref_D0.at(i), + ref_B1.at(i), + beta1, + {host_tensor_Bias1.at(i).device_data(), typename B2bGemm::LayoutC::Stride(0)}, + ref_ref_D1.at(i) + ); + if(relu) { + cutlass::reference::device::TensorReLu(host_tensor_ref_D1.at(i).device_view()); + } + cudaDeviceSynchronize(); + host_tensor_ref_D0.at(i).sync_host(); + host_tensor_ref_D1.at(i).sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(host_tensor_ref_D0.at(i).host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(host_tensor_D1.at(i).host_view()), 0); + 
CHECK_GT(cutlass::reference::host::TensorNorm(host_tensor_ref_D1.at(i).host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + host_tensor_ref_D1.at(i).host_view(), + host_tensor_D1.at(i).host_view()); + + CHECK_TRUE(passed); + if (!passed) + { + + std::stringstream fname; + + fname << "error_B2bGemm_device_fused.txt"; + std::cerr << "Check failed for GEMM " << i << " in the group." << std::endl; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream file(fname.str()); + + file + << "GEMM " << i << " in group\n" + << "A0 =\n" << host_tensor_A0.at(i).host_view() + << "\nB0 =\n" << host_tensor_B0.at(i).host_view() + << "\nC0 =\n" << host_tensor_C0.at(i).host_view() + << "\nScale0:\n" << host_tensor_Scale0.at(i).host_view() << "\n" + << "\nBias0:\n" << host_tensor_Bias0.at(i).host_view() << "\n" + << "\nB1 =\n" << host_tensor_B1.at(i).host_view() + << "\nC1 =\n" << host_tensor_C1.at(i).host_view() + << "\nBias1:\n" << host_tensor_Bias1.at(i).host_view() << "\n" + << "\n\nReference =\n" << host_tensor_ref_D1.at(i).host_view() + << "\nComputed =\n" << host_tensor_D1.at(i).host_view(); + + return false; + } + } + return true; + } + +}; + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h index f9905fa521..f70c21af8b 100644 --- a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h index 95c404d9ec..43a33b12f7 100644 --- a/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -43,6 +43,7 @@ #include "cutlass/util/reference/host/tensor_norm.h" #include "cutlass/util/host_reorder.h" #include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/gemm_complex.h" #include "cutlass/util/reference/device/tensor_relu.h" #include "reference/device/tensor_scale_bias.h" @@ -76,9 +77,9 @@ struct B2bInterleavedNonFusedGemmRun // B2bInterleavedNonFusedGemmRun( - cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, - cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, - cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform, uint64_t seed_ = 2080 ): @@ -87,7 +88,7 @@ struct B2bInterleavedNonFusedGemmRun /// Helper to initialize a tensor view template bool initialize_tensor( - cutlass::TensorView view, + cutlass::TensorView view, cutlass::Distribution::Kind dist_kind, uint64_t seed) { @@ -95,7 +96,7 @@ struct B2bInterleavedNonFusedGemmRun cutlass::reference::host::TensorFillRandomUniform( view, seed, 2, -2, 0); - } + } else if (dist_kind == cutlass::Distribution::Identity) { cutlass::reference::host::TensorFillIdentity(view); @@ -128,73 +129,72 @@ struct B2bInterleavedNonFusedGemmRun /// Executes one test bool run( - cutlass::gemm::GemmCoord problem_size_0, - cutlass::gemm::GemmCoord problem_size_1, - ElementCompute alpha0 = ElementCompute(1), + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute alpha0 = ElementCompute(1), ElementCompute beta0 = ElementCompute(0), - ElementCompute alpha1 = ElementCompute(1), + ElementCompute alpha1 = ElementCompute(1), ElementCompute beta1 = ElementCompute(0), bool relu = true, int warm_ups = 1, int runs = 100) { - + // // Allocate the GEMM workspace // cutlass::HostTensor< - typename Gemm0::ElementA, + typename Gemm0::ElementA, typename Gemm0::LayoutA> tensor_A0(problem_size_0.mk()); cutlass::HostTensor< - typename Gemm0::ElementB, + typename Gemm0::ElementB, typename Gemm0::LayoutB> tensor_B0(problem_size_0.kn()); cutlass::HostTensor< - typename Gemm0::ElementB, + typename Gemm0::ElementB, typename Gemm0::LayoutB> tensor_B0_reordered(problem_size_0.kn()); cutlass::HostTensor< - typename Gemm0::ElementC, + typename Gemm0::ElementC, typename Gemm0::LayoutC> tensor_C0(problem_size_0.mn()); cutlass::HostTensor< - typename Gemm0::ElementC, + typename Gemm0::ElementC, typename Gemm0::LayoutC> tensor_Bias0({1, problem_size_0.n()}); cutlass::HostTensor< - typename Gemm0::ElementC, + typename Gemm0::ElementC, typename Gemm0::LayoutC> tensor_D0(problem_size_0.mn()); cutlass::HostTensor< - typename Gemm0::ElementC, + typename Gemm0::ElementC, typename Gemm0::LayoutC> reference_D0(problem_size_0.mn()); cutlass::HostTensor< - typename Gemm1::ElementB, + typename Gemm1::ElementB, typename Gemm1::LayoutB> tensor_B1(problem_size_1.kn()); cutlass::HostTensor< - typename Gemm1::ElementB, + typename Gemm1::ElementB, typename Gemm1::LayoutB> tensor_B1_reordered(problem_size_1.kn()); cutlass::HostTensor< - typename Gemm1::ElementC, + typename Gemm1::ElementC, typename Gemm1::LayoutC> tensor_C1(problem_size_1.mn()); cutlass::HostTensor< - 
typename Gemm0::ElementC, + typename Gemm0::ElementC, typename Gemm1::LayoutC> tensor_Bias1({1, problem_size_1.n()}); cutlass::HostTensor< - typename Gemm1::ElementC, + typename Gemm1::ElementC, typename Gemm1::LayoutC> tensor_D1(problem_size_1.mn()); cutlass::HostTensor< - typename Gemm1::ElementC, + typename Gemm1::ElementC, typename Gemm1::LayoutC> reference_D1(problem_size_1.mn()); - CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018)); CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017)); @@ -285,13 +285,13 @@ struct B2bInterleavedNonFusedGemmRun for(int i = 0; i < runs; i++) { status = gemm_op_0(); - + CUTLASS_CHECK(status); } cudaEventRecord(stop1); for(int i = 0; i < runs; i++) { status = gemm_op_1(); - + CUTLASS_CHECK(status); } @@ -327,36 +327,36 @@ struct B2bInterleavedNonFusedGemmRun reference_gemm_0( problem_size_0, - alpha0, - tensor_A0.device_ref(), - tensor_B0.device_ref(), - beta0, + alpha0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + beta0, {tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)}, reference_D0.device_ref() ); if(relu) { - cutlass::reference::device::TensorReLu(reference_D0.device_view()); + cutlass::reference::device::TensorReLu(reference_D0.device_view()); } reference_gemm_1( problem_size_1, - alpha1, - reference_D0.device_ref(), - tensor_B1.device_ref(), + alpha1, + reference_D0.device_ref(), + tensor_B1.device_ref(), beta1, {tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)}, reference_D1.device_ref() ); - + if(relu) { - cutlass::reference::device::TensorReLu(reference_D1.device_view()); + cutlass::reference::device::TensorReLu(reference_D1.device_view()); } // Wait for kernels to finish cudaDeviceSynchronize(); - reference_D0.sync_host(); - reference_D1.sync_host(); + reference_D0.sync_host(); + reference_D1.sync_host(); CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0); CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0); @@ -364,7 +364,7 @@ struct B2bInterleavedNonFusedGemmRun CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); bool passed = cutlass::reference::host::TensorEquals( - reference_D1.host_view(), + reference_D1.host_view(), tensor_D1.host_view()); CHECK_TRUE(passed); @@ -377,7 +377,7 @@ struct B2bInterleavedNonFusedGemmRun std::ofstream file(fname.str()); - file + file << "A0 =\n" << tensor_A0.host_view() << "\nB0 =\n" << tensor_B0.host_view() << "\nB0_reordered =\n" << tensor_B0_reordered.host_view() @@ -416,9 +416,9 @@ struct B2bInterleavedFusedGemmRun // B2bInterleavedFusedGemmRun( - cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, - cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, - cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, cutlass::Distribution::Kind init_Scale_ = cutlass::Distribution::Uniform, cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform, uint64_t seed_ = 2080 @@ -429,7 +429,7 @@ struct B2bInterleavedFusedGemmRun /// Helper to initialize a tensor view template bool initialize_tensor( - cutlass::TensorView view, + cutlass::TensorView view, cutlass::Distribution::Kind dist_kind, uint64_t seed) { @@ -437,11 
+437,11 @@ struct B2bInterleavedFusedGemmRun cutlass::reference::host::TensorFillRandomUniform( view, seed, 2, -2, 0); - } + } else if (dist_kind == cutlass::Distribution::Identity) { cutlass::reference::host::TensorFillIdentity(view); - } + } else if (dist_kind == cutlass::Distribution::Gaussian) { cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); @@ -470,78 +470,99 @@ struct B2bInterleavedFusedGemmRun /// Executes one test bool run( - cutlass::gemm::GemmCoord problem_size_0, - cutlass::gemm::GemmCoord problem_size_1, - ElementCompute alpha0 = ElementCompute(1), + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute alpha0 = ElementCompute(1), ElementCompute beta0 = ElementCompute(0), - ElementCompute alpha1 = ElementCompute(1), + ElementCompute alpha1 = ElementCompute(1), ElementCompute beta1 = ElementCompute(0), + cutlass::gemm::GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm, + + // batch_count is used as split-k when mode is kGemm according + // to the GemmUniversal interface + + int batch_count = 1, + + int64_t batch_stride_A0 = 0, + int64_t batch_stride_B0 = 0, + int64_t batch_stride_C0 = 0, + int64_t batch_stride_B1 = 0, + int64_t batch_stride_C1 = 0, + int64_t batch_stride_D1 = 0, + int64_t batch_stride_Bias0 = 0, + int64_t batch_stride_Scale0 = 0, bool relu = true, int warm_ups = 1, int runs = 100) { - + // // Allocate the GEMM workspace // + cutlass::gemm::GemmCoord CoordA0(problem_size_0.m(), problem_size_0.n(), batch_count * problem_size_0.k()); + cutlass::gemm::GemmCoord CoordB0(problem_size_0.m(), problem_size_0.n(), batch_count * problem_size_0.k()); + cutlass::gemm::GemmCoord CoordC0(problem_size_0.m(), batch_count * problem_size_0.n(), problem_size_0.k()); + cutlass::gemm::GemmCoord CoordB1(problem_size_1.m(), problem_size_1.n(), batch_count * problem_size_1.k()); + cutlass::gemm::GemmCoord CoordC1(problem_size_1.m(), batch_count * problem_size_1.n(), problem_size_1.k()); + cutlass::HostTensor< - typename B2bGemm::ElementA, - typename B2bGemm::LayoutA> tensor_A0(problem_size_0.mk()); + typename B2bGemm::ElementA, + typename B2bGemm::LayoutA> tensor_A0(CoordA0.mk()); cutlass::HostTensor< - typename B2bGemm::ElementB, - typename B2bGemm::LayoutB> tensor_B0(problem_size_0.kn()); + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B0(CoordB0.kn()); cutlass::HostTensor< - typename B2bGemm::ElementB, - typename B2bGemm::LayoutB> tensor_B0_reordered(problem_size_0.kn()); + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B0_reordered(CoordB0.kn()); cutlass::HostTensor< - typename B2bGemm::ElementC, - typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn()); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C0(CoordC0.mn()); cutlass::HostTensor< - typename B2bGemm::ElementScaleBias, + typename B2bGemm::ElementScaleBias, typename B2bGemm::LayoutScaleBias> tensor_Scale0; if(alpha0 == ElementCompute(0)) //per-channel scale - tensor_Scale0.resize({1, problem_size_0.n()}); + tensor_Scale0.resize({1, batch_count * problem_size_0.n()}); cutlass::HostTensor< - typename B2bGemm::ElementScaleBias, - typename B2bGemm::LayoutScaleBias> tensor_Bias0({1, problem_size_0.n()}); + typename B2bGemm::ElementScaleBias, + typename B2bGemm::LayoutScaleBias> tensor_Bias0({1, batch_count * problem_size_0.n()}); cutlass::HostTensor< - ElementAccumulator, - typename B2bGemm::LayoutC> reference_Z0(problem_size_0.mn()); + ElementAccumulator, + typename B2bGemm::LayoutC> 
reference_Z0(CoordC0.mn()); cutlass::HostTensor< - typename B2bGemm::ElementC, - typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn()); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D0(CoordC0.mn()); cutlass::HostTensor< - typename B2bGemm::ElementB, - typename B2bGemm::LayoutB> tensor_B1(problem_size_1.kn()); + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B1(CoordB1.kn()); cutlass::HostTensor< - typename B2bGemm::ElementB, - typename B2bGemm::LayoutB> tensor_B1_reordered(problem_size_1.kn()); + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B1_reordered(CoordB1.kn()); cutlass::HostTensor< - typename B2bGemm::ElementC, - typename B2bGemm::LayoutC> tensor_C1(problem_size_1.mn()); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C1(CoordC1.mn()); cutlass::HostTensor< - typename B2bGemm::ElementC, - typename B2bGemm::LayoutScaleBias> tensor_Bias1({1, problem_size_1.n()}); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutScaleBias> tensor_Bias1({1, batch_count * problem_size_1.n()}); cutlass::HostTensor< - typename B2bGemm::ElementC, - typename B2bGemm::LayoutC> tensor_D1(problem_size_1.mn()); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_D1(CoordC1.mn()); cutlass::HostTensor< - typename B2bGemm::ElementC, - typename B2bGemm::LayoutC> reference_D1(problem_size_1.mn()); + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D1(CoordC1.mn()); CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); @@ -556,9 +577,9 @@ struct B2bInterleavedFusedGemmRun //Reorder B0 cutlass::reorder_column<16>( - tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), problem_size_0); + tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), CoordB0); cutlass::reorder_column( - tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), problem_size_1); + tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), CoordB1); cutlass::reference::host::TensorFill( tensor_D1.host_view()); @@ -581,12 +602,14 @@ struct B2bInterleavedFusedGemmRun tensor_D1.sync_device(); reference_D0.sync_device(); reference_D1.sync_device(); + // tensor_Bias0_batched.sync_device(); // // Initialize the GEMM operator // typename B2bGemm::Arguments arguments{ + mode, problem_size_0, problem_size_1, tensor_A0.device_ref(), @@ -597,8 +620,16 @@ struct B2bInterleavedFusedGemmRun tensor_B1_reordered.device_ref(), {tensor_Bias1.device_data(), typename B2bGemm::LayoutC::Stride(0)}, tensor_D1.device_ref(), + batch_stride_A0, + batch_stride_B0, + batch_stride_B1, + batch_stride_C1, + batch_stride_D1, + batch_stride_Bias0, + batch_stride_Scale0, {alpha0, beta0}, {alpha1, beta1}, + batch_count, }; B2bGemm b2b_gemm_op; @@ -651,32 +682,30 @@ struct B2bInterleavedFusedGemmRun // Verify // - cutlass::reference::device::Gemm< - typename B2bGemm::ElementA, typename B2bGemm::LayoutA, - typename B2bGemm::ElementB, typename B2bGemm::LayoutB, - ElementAccumulator, typename B2bGemm::LayoutC, - ElementAccumulator, ElementAccumulator> - reference_gemm_0; - - cutlass::reference::device::Gemm< - typename B2bGemm::ElementA, typename B2bGemm::LayoutA, - typename B2bGemm::ElementB, typename B2bGemm::LayoutB, - typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute, - ElementAccumulator, typename B2bGemm::Operator> - reference_gemm_1; - - reference_gemm_0( + cutlass::reference::device::GemmComplex< + typename B2bGemm::ElementA, typename B2bGemm::LayoutA, + typename B2bGemm::ElementB, typename B2bGemm::LayoutB, + 
ElementAccumulator, typename B2bGemm::LayoutC, + ElementAccumulator, ElementAccumulator + >( problem_size_0, ElementAccumulator(1), //intermediate alpha=1 - tensor_A0.device_ref(), - tensor_B0.device_ref(), + tensor_A0.device_ref(), + cutlass::ComplexTransform::kNone, + tensor_B0.device_ref(), + cutlass::ComplexTransform::kNone, ElementAccumulator(0), //beta = 0 reference_Z0.device_ref(), reference_Z0.device_ref(), - ElementAccumulator(0) + ElementAccumulator(0), + int(batch_count), + batch_stride_A0, + batch_stride_B0, + batch_stride_C0, + batch_stride_C0 ); - cutlass::reference::device::TensorScaleBiasGemm< + cutlass::reference::device::TensorScaleBiasGemmBatched< ElementAccumulator, typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute, typename B2bGemm::LayoutScaleBias > ( @@ -685,25 +714,45 @@ struct B2bInterleavedFusedGemmRun reference_D0.device_ref(), alpha0, tensor_Scale0.device_ref(), - tensor_Bias0.device_ref() + tensor_Bias0.device_ref(), + int(batch_count), + batch_stride_C0, + batch_stride_C0, + batch_stride_Scale0, + batch_stride_Bias0 ); if(relu) { - cutlass::reference::device::TensorReLu(reference_D0.device_view()); + cutlass::reference::device::TensorReLu(reference_D0.device_view()); } - reference_gemm_1( + cutlass::reference::device::GemmComplex< + typename B2bGemm::ElementA, typename B2bGemm::LayoutA, + typename B2bGemm::ElementB, typename B2bGemm::LayoutB, + typename B2bGemm::ElementC, typename B2bGemm::LayoutC, + ElementCompute, ElementAccumulator + >( problem_size_1, - alpha1, - reference_D0.device_ref(), - tensor_B1.device_ref(), - beta1, + alpha1, //intermediate alpha=1 + reference_D0.device_ref(), + cutlass::ComplexTransform::kNone, + tensor_B1.device_ref(), + cutlass::ComplexTransform::kNone, + beta1, //beta = 0 {tensor_Bias1.device_data(), typename B2bGemm::LayoutC::Stride(0)}, - reference_D1.device_ref() + reference_D1.device_ref(), + ElementAccumulator(0), + int(batch_count), + batch_stride_C0, + batch_stride_B1, + batch_stride_C1, + batch_stride_D1 ); + if(relu) { - cutlass::reference::device::TensorReLu(reference_D1.device_view()); + cutlass::reference::device::TensorReLu(reference_D1.device_view()); } + cudaDeviceSynchronize(); reference_D0.sync_host(); reference_D1.sync_host(); @@ -713,7 +762,7 @@ struct B2bInterleavedFusedGemmRun CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); bool passed = cutlass::reference::host::TensorEquals( - reference_D1.host_view(), + reference_D1.host_view(), tensor_D1.host_view()); CHECK_TRUE(passed); @@ -727,7 +776,7 @@ struct B2bInterleavedFusedGemmRun std::ofstream file(fname.str()); - file + file << "A0 =\n" << tensor_A0.host_view() << "\nB0 =\n" << tensor_B0.host_view() << "\nB0_reordered =\n" << tensor_B0_reordered.host_view() diff --git a/examples/13_two_tensor_op_fusion/device/b2b_gemm.h b/examples/13_two_tensor_op_fusion/device/b2b_gemm.h index 3751cc82b8..338090764e 100644 --- a/examples/13_two_tensor_op_fusion/device/b2b_gemm.h +++ b/examples/13_two_tensor_op_fusion/device/b2b_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -119,8 +119,6 @@ template < int AlignmentB = DefaultGemmConfiguration::kAlignmentB, - /// If true, kernel supports split-K with serial reduction - bool SplitKSerial = false, /// Operation performed by GEMM typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, @@ -154,7 +152,6 @@ class B2bGemm { static int const kAlignmentA = AlignmentA; static int const kAlignmentB = AlignmentB; static int const kAlignmentC = EpilogueOutputOp1::kCount; - static bool const kSplitKSerial = SplitKSerial; static ComplexTransform const kTransformA = ComplexTransform::kNone; static ComplexTransform const kTransformB = ComplexTransform::kNone; @@ -184,77 +181,11 @@ class B2bGemm { EpilogueOutputOp1, ThreadblockSwizzle, kStages, - kSplitKSerial, Operator, SmemAccumulator >::B2bGemmKernel; - /// Argument structure - struct Arguments { - - // - // Data members - // - - GemmCoord problem_size_0; - GemmCoord problem_size_1; - TensorRef ref_A0; - TensorRef ref_B0; - TensorRef ref_C0; - TensorRef ref_Scale0; - TensorRef ref_Bias0; - TensorRef ref_B1; - TensorRef ref_C1; - TensorRef ref_D1; - typename EpilogueOutputOp0::Params epilogue0; - typename EpilogueOutputOp1::Params epilogue1; - int split_k_slices; - - // - // Methods - // - - /// Default ctor - CUTLASS_HOST_DEVICE - Arguments(): problem_size_0(0, 0, 0), problem_size_1(0, 0, 0), split_k_slices(1) { - - } - - /// Constructs an Arguments structure - CUTLASS_HOST_DEVICE - Arguments( - GemmCoord problem_size_0_, - GemmCoord problem_size_1_, - TensorRef ref_A0_, - TensorRef ref_B0_, - TensorRef ref_C0_, - TensorRef ref_Scale0_, - TensorRef ref_Bias0_, - TensorRef ref_B1_, - TensorRef ref_C1_, - TensorRef ref_D1_, - typename EpilogueOutputOp0::Params epilogue0_ = - typename EpilogueOutputOp0::Params(), - typename EpilogueOutputOp1::Params epilogue1_ = - typename EpilogueOutputOp1::Params(), - int split_k_slices_ = 1 - ): - problem_size_0(problem_size_0_), - problem_size_1(problem_size_1_), - ref_A0(ref_A0_), - ref_B0(ref_B0_), - ref_C0(ref_C0_), - ref_Scale0(ref_Scale0_), - ref_Bias0(ref_Bias0_), - ref_B1(ref_B1_), - ref_C1(ref_C1_), - ref_D1(ref_D1_), - epilogue0(epilogue0_), - epilogue1(epilogue1_), - split_k_slices(split_k_slices_) { - - } - }; + using Arguments = typename B2bGemmKernel::Arguments; private: @@ -269,10 +200,6 @@ class B2bGemm { /// Determines whether the GEMM can execute the given problem. 
static Status can_implement(Arguments const &args) { - if (!kSplitKSerial && args.split_k_slices > 1) { - return Status::kErrorInvalidProblem; - } - Status status = B2bGemmKernel::can_implement( args.problem_size_0, args.problem_size_1, @@ -295,20 +222,14 @@ class B2bGemm { static size_t get_workspace_size(Arguments const &args) { size_t bytes = 0; - + // Determine grid shape ThreadblockSwizzle threadblock_swizzle; cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape( - args.problem_size_0, + args.problem_size_0, {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK}, - args.split_k_slices); - - if (kSplitKSerial && args.split_k_slices > 1) { - - - bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n()); - } + args.batch_count); return bytes; } @@ -320,38 +241,17 @@ class B2bGemm { ThreadblockSwizzle threadblock_swizzle; cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape( - args.problem_size_0, + args.problem_size_0, {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK}, - args.split_k_slices); + args.batch_count); // cutlass::gemm::GemmCoord grid_shape_1 = threadblock_swizzle.get_tiled_shape( -// args.problem_size_1, +// args.problem_size_1, // {ThreadblockShape1::kM, ThreadblockShape1::kN, ThreadblockShape1::kK}, -// args.split_k_slices); - - if (kSplitKSerial) { - if (args.split_k_slices > 1) { - if (!workspace) { - return Status::kErrorWorkspaceNull; - } - - size_t bytes = get_workspace_size(args); - - cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream); - - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - } - } - else { - - if (args.split_k_slices > 1) { - return Status::kErrorInvalidProblem; - } - } +// args.batch_count); // Initialize the Params structure params_ = typename B2bGemmKernel::Params{ + args.mode, args.problem_size_0, args.problem_size_1, grid_shape, @@ -363,6 +263,13 @@ class B2bGemm { args.ref_B1.non_const_ref(), args.ref_C1.non_const_ref(), args.ref_D1, + args.batch_stride_A0, + args.batch_stride_B0, + args.batch_stride_B1, + args.batch_stride_C1, + args.batch_stride_D1, + args.batch_stride_Bias0, + args.batch_stride_Scale0, args.epilogue0, args.epilogue1, static_cast(workspace), @@ -373,12 +280,6 @@ class B2bGemm { /// Lightweight update given a subset of arguments Status update(Arguments const &args, void *workspace = nullptr) { - - if (kSplitKSerial && args.split_k_slices > 1) { - if (!workspace) { - return Status::kErrorWorkspaceNull; - } - } params_.ref_A0.reset(args.ref_A0.non_const_ref().data()); params_.ref_B0.reset(args.ref_B0.non_const_ref().data()); @@ -430,12 +331,12 @@ class B2bGemm { /// Runs the kernel using initialized state. Status operator()( - Arguments const &args, - void *workspace = nullptr, + Arguments const &args, + void *workspace = nullptr, cudaStream_t stream = nullptr) { - + Status status = initialize(args, workspace, stream); - + if (status == Status::kSuccess) { status = run(stream); } diff --git a/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h b/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h index 7dd8fe2889..5d6a0e94f5 100644 --- a/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h +++ b/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_rf.cu index d9c59db0e1..9f5b89e550 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_rf.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_rf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu index 54a1315908..cf7133ee1f 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_rf.cu index 7a66f8e88b..be6d7d54e9 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_rf.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_rf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu index 5a60714160..50c886d1fe 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu index 2481fbd82e..5e94c7485a 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -220,7 +220,6 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75_rf_res() { return pass; } - int main() { std::vectorfuncs = { @@ -229,10 +228,6 @@ int main() { }; return testRun(75, funcs, "conv int8 RF residency"); - } - - //////////////////////////////////////////////////////////////////////////////// - diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu index 917ae93063..aeea07f2b1 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -39,7 +39,6 @@ #include "device/b2b_implicit_gemm_convolution.h" #include "b2b_interleaved_conv2d_run.h" #include "test_run.h" - //////////////////////////////////////////////////////////////////////////////// cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 ( @@ -219,20 +218,13 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75_shmem() { return pass; } - - int main() { - std::vectorfuncs = { &run_nonfused_conv2d_fprop_optimized_s8_sm75, &run_fused_conv2d_fprop_optimized_s8_sm75_shmem }; return testRun(75, funcs, "conv int8 shmem staging"); - } - - //////////////////////////////////////////////////////////////////////////////// - diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_rf.cu index a515f1255b..d91df2a64a 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_rf.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_rf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu index 9a5b2c1c56..2b865e6b0a 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_rf.cu index 54c8835543..44243c5558 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_rf.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_rf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu index 30ba26990d..e4709be43d 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_rf.cu index 0c2239ac2f..0e64d401c0 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_rf.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_rf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu index 045e4a8e58..9f6a2a08ec 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_grouped_f16_sm80_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_grouped_f16_sm80_rf.cu new file mode 100644 index 0000000000..87331d04d3 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_grouped_f16_sm80_rf.cu @@ -0,0 +1,297 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Example of running grouped back-to-back GEMMs when intermediate results are RF resident +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/base_grouped.h" +#include "cutlass/gemm/device/gemm.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "device/b2b_gemm.h" +#include "kernel/default_b2b_gemm.h" +#include "threadblock/grouped_threadblock_swizzle.h" +#include "b2b_grouped_gemm_run.h" +#include "test_run.h" + +//////////////////////////////////////////////////////////////////////////////// + +std::vector gemm_f16_sm80_problem_sizes_0; +std::vector gemm_f16_sm80_problem_sizes_1; + +// Constraints: +// 1. Warp shape N must equal thread block shape N +// 2. 
Problem size N must equal thread block shape N +using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>; +using WarpShape0 = cutlass::gemm::GemmShape<16, 64, 32>; +using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>; +using WarpShape1 = cutlass::gemm::GemmShape<16, 128, 32>; + +// Command line options parsing +struct Options { + + bool help; + bool error; + bool reference_check; + int alignment = 8; + + std::vector problem_sizes0; + std::vector problem_sizes1; + + int problem_count; + bool verbose; + + // + // Methods + // + + Options(): + help(false), + error(false), + reference_check(true), + problem_count(15), + verbose(false) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("problems", problem_count, 15); + cmd.get_cmd_line_argument("reference-check", reference_check, true); + cmd.get_cmd_line_argument("verbose", verbose, false); + + randomize_problems(cmd); + } + + void randomize_problems(cutlass::CommandLine &cmd) { + + // + // For now, randomly choose the problem sizes. + // + + int cmd_line_m = -1; + int cmd_line_k = -1; + + cmd.get_cmd_line_argument("m", cmd_line_m); + cmd.get_cmd_line_argument("k", cmd_line_k); + + problem_sizes0.reserve(problem_count); + problem_sizes1.reserve(problem_count); + + for (int i = 0; i < problem_count; ++i) { + + int m = cmd_line_m; + int k = cmd_line_k; + + if (m < 1) { + m = alignment * ((rand() % 256) + 1); + } + + if (k < 1) { + k = alignment * ((rand() % 256) + 1); + } + + cutlass::gemm::GemmCoord problem0(m, ThreadblockShape0::kN, k); + cutlass::gemm::GemmCoord problem1(m, ThreadblockShape1::kN, ThreadblockShape0::kN); + + problem_sizes0.push_back(problem0); + problem_sizes1.push_back(problem1); + } + + if (verbose) { + print_problem_sizes(); + } + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "13_fused_two_gemms_grouped_f16_sm80_rf\n\n" + << " This example runs a grouped back-to-back GEMM kernel. A group of independent back-to-back GEMMs are\n" + << " run in a single kernel. Each indivdual problem in the group is subject to the same constraints that non-grouped\n" + << " back-to-back GEMMs are subject to.s" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --problems= Number of individual GEMM problems (default: --problems=15)\n" + << " --m= Sets the M dimension of both GEMMs for all groups. Otherwise, it is selected randomly\n" + << " --k= Sets the K dimension of the first GEMM for all groups. 
Otherwise, it is selected randomly\n" + << " --verbose= If true, prints problem sizes.\n"; + + out << "\n\nExamples:\n\n" + + << "# Runs a grouped B2b GEMM with 10 random problem sizes\n" + << "$ ./examples/13_two_tensor_op_fusion/13_fused_two_gemms_grouped_f16_sm80_rf --groups=10\n\n"; + + return out; + } + + void print_problem_sizes() { + std::cout << std::endl; + std::cout << "Executing " << problem_count << " independent back-to-back GEMMs in a group" << std::endl; + for (int i = 0; i < problem_count; ++i) { + cutlass::gemm::GemmCoord problem0 = problem_sizes0.at(i); + cutlass::gemm::GemmCoord problem1 = problem_sizes1.at(i); + std::cout << "Problem " << i + << "\t\tGEMM0: " << problem0.m() << 'x' << problem0.n() << 'x' << problem0.k() + << "\t\tGEMM1: " << problem1.m() << 'x' << problem1.n() << 'x' << problem1.k() + << std::endl; + } + } +}; + +bool run_fused_grouped_gemm_f16_sm80_rf_res() { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + //Fused kernel has built-in bias, setting beta=0 + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(1); //beta=1 for bias + + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute, + cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute, + cutlass::epilogue::thread::ScaleType::NoBetaScaling + >; + + using GroupedThreadblockSwizzle = cutlass::gemm::threadblock::B2bGemmGroupedThreadblockSwizzle< + ThreadblockShape0, + cutlass::layout::RowMajor // LayoutC + >; + + const int kAlignment = 128 / cutlass::sizeof_bits::value; + const int kStages = 3; + using B2bGemmKernel = cutlass::gemm::kernel::DefaultB2bGemm< + cutlass::half_t, + cutlass::layout::RowMajor, + kAlignment, + cutlass::half_t, + cutlass::layout::ColumnMajor, + kAlignment, + cutlass::half_t, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + GroupedThreadblockSwizzle, + kStages, + cutlass::arch::OpMultiplyAdd + >::B2bGemmKernel; + + using B2bGemm = cutlass::gemm::device::BaseGrouped; + + B2bFusedGroupedGemmRun fusedGemm; + + std::cout << "Running Fused back-to-back FP16 TN Grouped GEMMs with RF residency...\n"; + bool passed = fusedGemm.run(gemm_f16_sm80_problem_sizes_0, gemm_f16_sm80_problem_sizes_1, alpha0, beta0, alpha1, beta1); + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + + return passed; +} + +int main(int argc, char const **args) { + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." 
<< std::endl; + return -1; + } + + gemm_f16_sm80_problem_sizes_0 = options.problem_sizes0; + gemm_f16_sm80_problem_sizes_1 = options.problem_sizes1; + + std::vectorfuncs = { + &run_fused_grouped_gemm_f16_sm80_rf_res + }; + + return testRun(80, funcs, "grouped gemm f16 RF residency"); +} + + + + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu index 2c00eb86c5..a7f39d2d3d 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -195,7 +195,6 @@ bool run_fused_gemm_s8_rf_res() { return passed; } - int main() { std::vectorfuncs = { @@ -204,9 +203,6 @@ int main() { }; return testRun(75, funcs, "gemm int8 RF residency"); - - } - //////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu index 10f4cb7b51..671f48b712 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -43,7 +43,6 @@ #include "device/b2b_gemm.h" #include "b2b_interleaved_gemm_run.h" #include "test_run.h" - //////////////////////////////////////////////////////////////////////////////// cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*640, 64, 576); @@ -197,18 +196,13 @@ bool run_fused_gemm_s8_shmem() { return passed; } - int main() { std::vectorfuncs = { &run_nonfused_gemm_s8, &run_fused_gemm_s8_shmem }; - return testRun(75, funcs, "gemm int8 shmem staing"); - - } - //////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_rf.cu index 38845371b8..b2f12b45f3 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_rf.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_rf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
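// Usage note for the grouped example added above (a sketch; flag names follow its
// Options::parse(), which reads --problems, --m, --k, and --verbose, while the printed
// help text still shows --groups):
//
//   $ ./examples/13_two_tensor_op_fusion/13_fused_two_gemms_grouped_f16_sm80_rf --problems=10 --verbose=true
//
// When --m/--k are omitted, M and K are drawn as random multiples of the 8-element
// alignment, and each problem's N is pinned to the corresponding ThreadblockShape::kN,
// matching the fused-kernel constraints stated at the top of that file.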
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -152,7 +152,7 @@ bool run_fused_gemm_s8_sm80_rf_res() { using WarpShape1 = cutlass::gemm::GemmShape<16, 128, 64>; using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; - using EpilogueOutputOp0 = + using EpilogueOutputOp0 = cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput, 8 * InstructionShape::kN / 32, @@ -161,7 +161,7 @@ bool run_fused_gemm_s8_sm80_rf_res() { cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling >; - using EpilogueOutputOp1 = + using EpilogueOutputOp1 = cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput, 64 / cutlass::sizeof_bits::value, @@ -194,14 +194,21 @@ bool run_fused_gemm_s8_sm80_rf_res() { SmemAccumulator, 16, 16, - false, cutlass::arch::OpMultiplyAddSaturate >; B2bInterleavedFusedGemmRun fusedGemm; std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs with RF residency...\n"; - bool passed = fusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1); + bool passed = fusedGemm.run( + gemm_s8_sm80_problem_size_0, + gemm_s8_sm80_problem_size_1, + alpha0, + beta0, + alpha1, + beta1 + ); + if(passed) std::cout << "Pass\n"; else @@ -210,18 +217,123 @@ bool run_fused_gemm_s8_sm80_rf_res() { return passed; } +bool run_fused_gemm_s8_sm80_rf_res_batch() { + + + cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(256, 64, 128); + cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(256, 128, 64); + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + //Fused kernel has built-in bias, setting beta=0 + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(1); //beta=1 for bias + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<16, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 64>; + + using WarpShape1 = cutlass::gemm::GemmShape<16, 128, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 8 * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute, + cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute, + cutlass::epilogue::thread::ScaleType::NoBetaScaling + >; + + const bool SmemAccumulator = false; + + using B2bGemm = cutlass::gemm::device::B2bGemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + SmemAccumulator, + 16, + 16, + cutlass::arch::OpMultiplyAddSaturate + >; + + B2bInterleavedFusedGemmRun fusedGemm; + + int batch_count = 2; + int64_t batch_stride_A0 = gemm_s8_sm80_problem_size_0.m() * gemm_s8_sm80_problem_size_0.k(); + int64_t batch_stride_B0 = gemm_s8_sm80_problem_size_1.k() * 
gemm_s8_sm80_problem_size_1.n(); + int64_t batch_stride_C0 = gemm_s8_sm80_problem_size_0.m() * gemm_s8_sm80_problem_size_0.n(); + int64_t batch_stride_B1 = gemm_s8_sm80_problem_size_1.k() * gemm_s8_sm80_problem_size_1.n(); + int64_t batch_stride_C1 = gemm_s8_sm80_problem_size_1.n(); + int64_t batch_stride_D1 = gemm_s8_sm80_problem_size_1.m() * gemm_s8_sm80_problem_size_1.n(); + int64_t batch_stride_Bias0 = gemm_s8_sm80_problem_size_0.n(); + int64_t batch_stride_Scale0 = 0; + + std::cout << "Running Fused back-to-back INT8 NT interleaved Batched GEMMs with RF residency...\n"; + bool passed = fusedGemm.run( + gemm_s8_sm80_problem_size_0, + gemm_s8_sm80_problem_size_1, + alpha0, + beta0, + alpha1, + beta1, + cutlass::gemm::GemmUniversalMode::kBatched, + batch_count, + batch_stride_A0, + batch_stride_B0, + batch_stride_C0, + batch_stride_B1, + batch_stride_C1, + batch_stride_D1, + batch_stride_Bias0, + batch_stride_Scale0 + ); + + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + + return passed; +} int main() { std::vectorfuncs = { &run_nonfused_gemm_s8_sm80, - &run_fused_gemm_s8_sm80_rf_res + &run_fused_gemm_s8_sm80_rf_res, + &run_fused_gemm_s8_sm80_rf_res_batch }; return testRun(80, funcs, "gemm int8 RF residency"); - - } - //////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu index 7afe440941..84354221cf 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -151,7 +151,7 @@ bool run_fused_gemm_s8_sm80_shmem() { using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; - using EpilogueOutputOp0 = + using EpilogueOutputOp0 = cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput, 8 * InstructionShape::kN / 32, @@ -160,7 +160,7 @@ bool run_fused_gemm_s8_sm80_shmem() { cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling >; - using EpilogueOutputOp1 = + using EpilogueOutputOp1 = cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput, 64 / cutlass::sizeof_bits::value, @@ -168,7 +168,7 @@ bool run_fused_gemm_s8_sm80_shmem() { ElementCompute, cutlass::epilogue::thread::ScaleType::NoBetaScaling >; - + const bool SmemAccumulator = true; using B2bGemm = cutlass::gemm::device::B2bGemm< @@ -193,7 +193,6 @@ bool run_fused_gemm_s8_sm80_shmem() { SmemAccumulator, 16, 16, - false, cutlass::arch::OpMultiplyAddSaturate >; diff --git a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h index 306e8cf47e..fca87a1d09 100644 --- a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h +++ b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -40,19 +40,66 @@ #include "cutlass/matrix_coord.h" #include "cutlass/semaphore.h" +#include "kernel/b2b_gemm_grouped_problem_visitor.h" +#include "threadblock/grouped_threadblock_swizzle.h" + ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { namespace gemm { namespace kernel { +namespace detail { + +/// Utility struct for returning the type of the problem visitor used by the swizzling function, +/// if it is a grouped swizzling function, or a default visitor. This is used only for defining +/// the parameters of the problem visitor used in GroupedParams. +template < + typename B2bMma_, + typename ThreadblockSwizzle_, + typename Enable = void +> +struct ProblemVisitorOrDefault; + +/// Return a generic problem visitor for GEMM problems +template < + typename B2bMma_, + typename ThreadblockSwizzle_ +> +struct ProblemVisitorOrDefault::value + >::type> { + using value = B2bGemmGroupedProblemVisitor::value>; +}; + +/// Return the problem visitor specified by the swizzling function +template < + typename B2bMma_, + typename ThreadblockSwizzle_ +> +struct ProblemVisitorOrDefault::value + >::type> { + using value = typename ThreadblockSwizzle_::ProblemVisitor; +}; + +} // namespace detail + ///////////////////////////////////////////////////////////////////////////////////////////////// template < - typename B2bMma_, ///! Threadblock-scoped matrix multiply-accumulate + typename B2bMma_, ///! Threadblock-scoped matrix multiply-accumulate typename Epilogue_, ///! Epilogue - typename ThreadblockSwizzle_, ///! Threadblock swizzling function - bool SplitKSerial ///! If true, code supporting split-K via serial reduction is enabled. + typename ThreadblockSwizzle_ ///! Threadblock swizzling function > struct B2bGemm { @@ -61,50 +108,225 @@ struct B2bGemm { using OutputOp0 = typename B2bMma::OutputOp; using OutputOp1 = typename Epilogue::OutputOp; using ThreadblockSwizzle = ThreadblockSwizzle_; - static bool const kSplitKSerial = SplitKSerial; + + using ElementA0 = typename B2bMma::IteratorA0::Element; + using LayoutA0 = typename B2bMma::IteratorA0::Layout; + using ElementB0 = typename B2bMma::IteratorB0::Element; + using LayoutB0 = typename B2bMma::IteratorB0::Layout; + using ElementB1 = typename B2bMma::IteratorB1::Element; + using LayoutB1 = typename B2bMma::IteratorB1::Layout; + using ElementC = typename Epilogue::OutputTileIterator::Element; + using LayoutC = typename Epilogue::OutputTileIterator::Layout; + + using ScaleBiasData = typename B2bMma::IteratorAccumulatorScaleBias::Element; + + /// Data types needed for higher-level containers. In some cases, a single type must be exposed + /// despite the B2b GEMM using two GEMMs under the hood. 
In such cases, we select the values from + /// the second GEMM (other than for ElementA/ElementB) + using ElementA = typename B2bMma::IteratorA0::Element; + using LayoutA = typename B2bMma::IteratorA0::Layout; + using ElementB = typename B2bMma::IteratorB0::Element; + using LayoutB = typename B2bMma::IteratorB0::Layout; + + static ComplexTransform const kTransformA = B2bMma::kTransformA; + static ComplexTransform const kTransformB = B2bMma::kTransformB; + using Operator = typename B2bMma::Operator0; + + using OperatorClass = typename Operator::OperatorClass; + using ThreadblockShape = typename B2bMma::Shape0; + using WarpShape = typename Operator::Shape; + using InstructionShape = typename Operator::InstructionShape; + using ArchTag = typename B2bMma::ArchTag; + + static int const kStages = B2bMma::kStages; + static int const kAlignmentA = B2bMma::IteratorA::AccessType::kElements; + static int const kAlignmentB = B2bMma::IteratorB::AccessType::kElements; + static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; + + using Mma = B2bMma; + using EpilogueOutputOp = OutputOp1; /// Warp count (concept: GemmShape) using WarpCount0 = typename B2bMma::WarpCount0; static int const kThreadCount = 32 * WarpCount0::kCount; + /// Argument structure + struct Arguments { + + // + // Data members + // + + GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm; + GemmCoord problem_size_0{0,0,0}; + GemmCoord problem_size_1{0,0,0}; + typename B2bMma::IteratorA0::TensorRef ref_A0{}; + typename B2bMma::IteratorB0::TensorRef ref_B0{}; + typename Epilogue::OutputTileIterator::TensorRef ref_C0{}; + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Scale0{}; + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Bias0{}; + typename B2bMma::IteratorB1::TensorRef ref_B1{}; + typename Epilogue::OutputTileIterator::TensorRef ref_C1{}; + typename Epilogue::OutputTileIterator::TensorRef ref_D1{}; + int64_t batch_stride_A0{0}; + int64_t batch_stride_B0{0}; + int64_t batch_stride_B1{0}; + int64_t batch_stride_C1{0}; + int64_t batch_stride_D1{0}; + int64_t batch_stride_Bias0{0}; + int64_t batch_stride_Scale0{0}; + typename OutputOp0::Params epilogue0 {}; + typename OutputOp1::Params epilogue1 {}; + int batch_count{1}; + + // + // Methods + // + + /// Default ctor + Arguments() = default; + + /// Constructs an Arguments structure + CUTLASS_HOST_DEVICE + Arguments( + GemmUniversalMode mode_, + GemmCoord problem_size_0_, + GemmCoord problem_size_1_, + typename B2bMma::IteratorA0::TensorRef ref_A0_, + typename B2bMma::IteratorB0::TensorRef ref_B0_, + typename Epilogue::OutputTileIterator::TensorRef ref_C0_, + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Scale0_, + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Bias0_, + typename B2bMma::IteratorB1::TensorRef ref_B1_, + typename Epilogue::OutputTileIterator::TensorRef ref_C1_, + typename Epilogue::OutputTileIterator::TensorRef ref_D1_, + int64_t batch_stride_A0_, + int64_t batch_stride_B0_, + int64_t batch_stride_B1_, + int64_t batch_stride_C1_, + int64_t batch_stride_D1_, + int64_t batch_stride_Bias0_, + int64_t batch_stride_Scale0_, + typename OutputOp0::Params epilogue0_ = typename OutputOp0::Params(), + typename OutputOp1::Params epilogue1_ = typename OutputOp1::Params(), + int batch_count_ = 1 + ): + mode(mode_), + problem_size_0(problem_size_0_), + problem_size_1(problem_size_1_), + ref_A0(ref_A0_), + ref_B0(ref_B0_), + ref_C0(ref_C0_), + ref_Scale0(ref_Scale0_), + ref_Bias0(ref_Bias0_), + 
ref_B1(ref_B1_), + ref_C1(ref_C1_), + ref_D1(ref_D1_), + batch_stride_A0(batch_stride_A0_), + batch_stride_B0(batch_stride_B0_), + batch_stride_B1(batch_stride_B1_), + batch_stride_C1(batch_stride_C1_), + batch_stride_D1(batch_stride_D1_), + batch_stride_Bias0(batch_stride_Bias0_), + batch_stride_Scale0(batch_stride_Scale0_), + epilogue0(epilogue0_), + epilogue1(epilogue1_), + batch_count(batch_count_) { + } + }; + + // Arguments structure for grouped B2B problems + struct GroupedArguments { + GemmCoord* problem_size_0; + GemmCoord* problem_size_1; + typename B2bMma::IteratorA0::TensorRef* ref_A0; + typename B2bMma::IteratorB0::TensorRef* ref_B0; + typename Epilogue::OutputTileIterator::TensorRef* ref_C0; + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef* ref_Scale0; + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef* ref_Bias0; + typename B2bMma::IteratorB1::TensorRef* ref_B1; + typename Epilogue::OutputTileIterator::TensorRef* ref_C1; + typename Epilogue::OutputTileIterator::TensorRef* ref_D1; + + // Epilogue params remain constant across all problmes in the group. Thus, + // the parameter here is not a pointer. + typename OutputOp0::Params epilogue0; + typename OutputOp1::Params epilogue1; + + int problem_count; + int threadblock_count; + GemmCoord* host_problem_sizes; + + CUTLASS_HOST_DEVICE + GroupedArguments( + int problem_count, + GemmCoord* problem_size_0_, + GemmCoord* problem_size_1_, + typename B2bMma::IteratorA0::TensorRef* ref_A0_, + typename B2bMma::IteratorB0::TensorRef* ref_B0_, + typename Epilogue::OutputTileIterator::TensorRef* ref_C0_, + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef* ref_Scale0_, + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef* ref_Bias0_, + typename B2bMma::IteratorB1::TensorRef* ref_B1_, + typename Epilogue::OutputTileIterator::TensorRef* ref_C1_, + typename Epilogue::OutputTileIterator::TensorRef* ref_D1_, + typename OutputOp0::Params epilogue0_ = typename OutputOp0::Params(), + typename OutputOp1::Params epilogue1_ = typename OutputOp1::Params(), + int threadblock_count = 0 + ) : problem_size_0(problem_size_0_), problem_size_1(problem_size_1_), + ref_A0(ref_A0_), ref_B0(ref_B0_), ref_C0(ref_C0_), + ref_Scale0(ref_Scale0_), ref_Bias0(ref_Bias0_), ref_B1(ref_B1_), + ref_C1(ref_C1_), ref_D1(ref_D1_), epilogue0(epilogue0_), epilogue1(epilogue1_), + problem_count(problem_count), + threadblock_count(threadblock_count) + {} + }; + /// Parameters structure struct Params { - cutlass::gemm::GemmCoord problem_size_0; - cutlass::gemm::GemmCoord problem_size_1; - cutlass::gemm::GemmCoord grid_tiled_shape; - int swizzle_log_tile; - typename B2bMma::IteratorA0::Params params_A0; - typename B2bMma::IteratorA0::TensorRef ref_A0; - typename B2bMma::IteratorB0::Params params_B0; - typename B2bMma::IteratorB0::TensorRef ref_B0; - typename Epilogue::OutputTileIterator::Params params_C0; - typename Epilogue::OutputTileIterator::TensorRef ref_C0; - typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Scale0; - typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Bias0; - typename B2bMma::IteratorB1::Params params_B1; - typename B2bMma::IteratorB1::TensorRef ref_B1; - typename Epilogue::OutputTileIterator::Params params_C1; - typename Epilogue::OutputTileIterator::TensorRef ref_C1; - typename Epilogue::OutputTileIterator::Params params_D1; - typename Epilogue::OutputTileIterator::TensorRef ref_D1; - typename OutputOp0::Params output_op_0; - typename OutputOp1::Params output_op_1; - int *semaphore; - int 
gemm_k_iterations_0; - int gemm_k_size_0; - int gemm_k_iterations_1; - int gemm_k_size_1; + cutlass::gemm::GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm; + cutlass::gemm::GemmCoord problem_size_0{}; + cutlass::gemm::GemmCoord problem_size_1{}; + cutlass::gemm::GemmCoord grid_tiled_shape{}; + int swizzle_log_tile{0}; + typename B2bMma::IteratorA0::Params params_A0{}; + typename B2bMma::IteratorA0::TensorRef ref_A0{}; + typename B2bMma::IteratorB0::Params params_B0{}; + typename B2bMma::IteratorB0::TensorRef ref_B0{}; + typename Epilogue::OutputTileIterator::Params params_C0{}; + typename Epilogue::OutputTileIterator::TensorRef ref_C0{}; + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Scale0{}; + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Bias0{}; + typename B2bMma::IteratorB1::Params params_B1{}; + typename B2bMma::IteratorB1::TensorRef ref_B1{}; + typename Epilogue::OutputTileIterator::Params params_C1{}; + typename Epilogue::OutputTileIterator::TensorRef ref_C1{}; + typename Epilogue::OutputTileIterator::Params params_D1{}; + typename Epilogue::OutputTileIterator::TensorRef ref_D1{}; + typename OutputOp0::Params output_op_0{}; + typename OutputOp1::Params output_op_1{}; + int64_t batch_stride_A0{0}; + int64_t batch_stride_B0{0}; + int64_t batch_stride_B1{0}; + int64_t batch_stride_C1{0}; + int64_t batch_stride_D1{0}; + int64_t batch_stride_Bias0{0}; + int64_t batch_stride_Scale0{0}; + int *semaphore = nullptr; + int gemm_k_iterations_0{0}; + int gemm_k_size_0{0}; + int gemm_k_iterations_1{0}; + int gemm_k_size_1{0}; // // Methods // - CUTLASS_HOST_DEVICE - Params(): swizzle_log_tile(0), semaphore(0), gemm_k_iterations_0(0), gemm_k_size_0(0), - gemm_k_iterations_1(0), gemm_k_size_1(0) { } + Params() = default; CUTLASS_HOST_DEVICE Params( + cutlass::gemm::GemmUniversalMode mode, cutlass::gemm::GemmCoord const & problem_size_0, cutlass::gemm::GemmCoord const & problem_size_1, cutlass::gemm::GemmCoord const & grid_tiled_shape, @@ -116,14 +338,22 @@ struct B2bGemm { typename B2bMma::IteratorB1::TensorRef ref_B1, typename Epilogue::OutputTileIterator::TensorRef ref_C1, typename Epilogue::OutputTileIterator::TensorRef ref_D1, + int64_t batch_stride_A0, + int64_t batch_stride_B0, + int64_t batch_stride_B1, + int64_t batch_stride_C1, + int64_t batch_stride_D1, + int64_t batch_stride_Bias0, + int64_t batch_stride_Scale0, typename OutputOp0::Params output_op_0 = typename OutputOp0::Params(), typename OutputOp1::Params output_op_1 = typename OutputOp1::Params(), int *workspace = nullptr ): + mode(mode), problem_size_0(problem_size_0), problem_size_1(problem_size_1), grid_tiled_shape(grid_tiled_shape), - swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)), + swizzle_log_tile(ThreadblockSwizzle::get_log_tile(grid_tiled_shape)), params_A0(ref_A0.layout()), ref_A0(ref_A0), params_B0(ref_B0.layout()), @@ -138,6 +368,13 @@ struct B2bGemm { ref_C1(ref_C1), params_D1(ref_D1.layout()), ref_D1(ref_D1), + batch_stride_A0(batch_stride_A0), + batch_stride_B0(batch_stride_B0), + batch_stride_B1(batch_stride_B1), + batch_stride_C1(batch_stride_C1), + batch_stride_D1(batch_stride_D1), + batch_stride_Bias0(batch_stride_Bias0), + batch_stride_Scale0(batch_stride_Scale0), output_op_0(output_op_0), output_op_1(output_op_1) { @@ -152,6 +389,81 @@ struct B2bGemm { } }; + struct GroupedParams { + cutlass::gemm::GemmCoord* problem_size_0; + cutlass::gemm::GemmCoord* problem_size_1; + cutlass::gemm::GemmCoord* grid_tiled_shape; + typename 
B2bMma::IteratorA0::TensorRef* ref_A0; + typename B2bMma::IteratorB0::TensorRef* ref_B0; + typename Epilogue::OutputTileIterator::TensorRef* ref_C0; + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef* ref_Scale0; + typename B2bMma::IteratorAccumulatorScaleBias::TensorRef* ref_Bias0; + typename B2bMma::IteratorB1::TensorRef* ref_B1; + typename Epilogue::OutputTileIterator::TensorRef* ref_C1; + typename Epilogue::OutputTileIterator::TensorRef* ref_D1; + + // Epilogue params remain constant across all problmes in the group. Thus, + // the parameter here is not a pointer. + typename OutputOp0::Params output_op_0; + typename OutputOp1::Params output_op_1; + + using ProblemVisitor = typename detail::ProblemVisitorOrDefault::value; + typename ProblemVisitor::Params problem_visitor; + int threadblock_count; + int* workspace; + + CUTLASS_HOST_DEVICE + GroupedParams() {} + + CUTLASS_HOST_DEVICE + GroupedParams( + GroupedArguments const &args, + void *workspace = nullptr, + int tile_count = 0 + ) : + problem_size_0(args.problem_size_0), problem_size_1(args.problem_size_1), + ref_A0(args.ref_A0), ref_B0(args.ref_B0), ref_C0(args.ref_C0), + ref_Scale0(args.ref_Scale0), ref_Bias0(args.ref_Bias0), ref_B1(args.ref_B1), ref_C1(args.ref_C1), ref_D1(args.ref_D1), + output_op_0(args.epilogue0), output_op_1(args.epilogue1), + problem_visitor(args.problem_size_0, args.problem_size_1, args.problem_count, workspace, tile_count), + threadblock_count(args.threadblock_count), + workspace(reinterpret_cast(workspace)) {} + + CUTLASS_HOST_DEVICE + void transpose() { + // Only row-major outputs are currently supported, so no transpose is performed + } + + /// Returns non-grouped paramaters to be used as input to the kernel-level + /// operator for the problem indicated by problem_visitor. 
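  /// A minimal sketch of how a grouped driver is expected to consume these Params,
  /// assuming the base GroupedProblemVisitor interface (next_tile()/advance()) and an
  /// externally constructed grouped swizzle, shared storage, and kernel instance;
  /// the names here are illustrative:
  ///
  ///   while (problem_visitor.next_tile()) {
  ///     typename B2bGemm::Params p = grouped_params.to_single_params(problem_visitor);
  ///     b2b_gemm_kernel.run_with_swizzle(p, shared_storage, swizzle);
  ///     problem_visitor.advance(gridDim.x);
  ///   }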
+ CUTLASS_HOST_DEVICE + Params to_single_params(const ProblemVisitor& problem_visitor) const { + GemmCoord problem_size0 = problem_visitor.problem_size0(); + GemmCoord problem_size1 = problem_visitor.problem_size1(); + int32_t idx = problem_visitor.problem_index(); + GemmCoord grid_shape = problem_visitor.grid_shape(problem_size1); + + return Params( + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size0, + problem_size1, + grid_shape, + ref_A0[idx], + ref_B0[idx], + ref_C0[idx], + ref_Scale0[idx], + ref_Bias0[idx], + ref_B1[idx], + ref_C1[idx], + ref_D1[idx], + 0, 0, 0, 0, 0, 0, 0, // Batched B2B GEMMs within the grouped kernel are currently unsupported + output_op_0, + output_op_1, + workspace + ); + } + }; + /// Shared memory storage structure union SharedStorage { typename B2bMma::B2bMmaSharedStorage main_loop; @@ -163,7 +475,7 @@ struct B2bGemm { // CUTLASS_HOST_DEVICE - B2bGemm() { } + B2bGemm() { } /// Determines whether kernel satisfies alignment static Status can_implement( @@ -223,7 +535,7 @@ struct B2bGemm { if(problem_size_0.n() > B2bMma::Shape0::kN) return Status::kErrorInvalidProblem; - + if(problem_size_1.n() > B2bMma::Shape1::kN) return Status::kErrorInvalidProblem; @@ -233,9 +545,13 @@ struct B2bGemm { /// Executes one GEMM CUTLASS_DEVICE void operator()(Params const ¶ms, SharedStorage &shared_storage) { - - // Compute threadblock location ThreadblockSwizzle threadblock_swizzle; + run_with_swizzle(params, shared_storage, threadblock_swizzle); + } + + /// Executes one GEMM with an externally-provided swizzling function + CUTLASS_DEVICE + void run_with_swizzle(Params const ¶ms, SharedStorage &shared_storage, ThreadblockSwizzle& threadblock_swizzle) { cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); @@ -247,37 +563,64 @@ struct B2bGemm { return; } + ElementA0 *ptr_A0 = static_cast(params.ref_A0.data()); + ElementB0 *ptr_B0 = static_cast(params.ref_B0.data()); + ElementB1 *ptr_B1 = static_cast(params.ref_B1.data()); + + ScaleBiasData *ptr_Bias0 = static_cast(params.ref_Bias0.data()); + ScaleBiasData *ptr_Scale0 = static_cast(params.ref_Scale0.data()); + + int offset_k_0 = 0; + int offset_k_1 = 0; + + int problem_size_k_0 = params.problem_size_0.k(); + int problem_size_k_1 = params.problem_size_1.k(); + + if (params.mode == GemmUniversalMode::kGemm) { + + // Problem size is a function of threadblock index in the K dimension + problem_size_k_0 = min( + problem_size_k_0, + (threadblock_tile_offset.k() + 1) * params.gemm_k_size_0); + + // Problem size is a function of threadblock index in the K dimension + problem_size_k_1 = min( + problem_size_k_1, + (threadblock_tile_offset.k() + 1) * params.gemm_k_size_1); + + offset_k_0 = threadblock_tile_offset.k() * params.gemm_k_size_0; + offset_k_1 = threadblock_tile_offset.k() * params.gemm_k_size_1; + } + + else if (params.mode == GemmUniversalMode::kBatched) { + ptr_A0 += threadblock_tile_offset.k() * params.batch_stride_A0; + ptr_B0 += threadblock_tile_offset.k() * params.batch_stride_B0; + ptr_B1 += threadblock_tile_offset.k() * params.batch_stride_B1; + ptr_Bias0 += threadblock_tile_offset.k() * params.batch_stride_Bias0; + ptr_Scale0 += threadblock_tile_offset.k() * params.batch_stride_Scale0; + } + // Compute initial location in logical coordinates cutlass::MatrixCoord tb_offset_A0{ threadblock_tile_offset.m() * B2bMma::Shape0::kM, - threadblock_tile_offset.k() * params.gemm_k_size_0, + offset_k_0, }; cutlass::MatrixCoord tb_offset_B0{ - threadblock_tile_offset.k() 
* params.gemm_k_size_0, + offset_k_0, threadblock_tile_offset.n() * B2bMma::Shape0::kN }; cutlass::MatrixCoord tb_offset_B1{ - threadblock_tile_offset.k() * params.gemm_k_size_1, + offset_k_1, threadblock_tile_offset.n() * B2bMma::Shape1::kN }; - // Problem size is a function of threadblock index in the K dimension - int problem_size_k_0 = min( - params.problem_size_0.k(), - (threadblock_tile_offset.k() + 1) * params.gemm_k_size_0); - // Compute threadblock-scoped matrix multiply-add int gemm_k_iterations_0 = (problem_size_k_0 - tb_offset_A0.column() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK; - // Problem size is a function of threadblock index in the K dimension - int problem_size_k_1 = min( - params.problem_size_1.k(), - (threadblock_tile_offset.k() + 1) * params.gemm_k_size_1); - // Compute threadblock-scoped matrix multiply-add -// int gemm_k_iterations_1 = (problem_size_k_1 - tb_offset_B1.row() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK; + // int gemm_k_iterations_1 = (problem_size_k_1 - tb_offset_B1.row() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK; // Compute position within threadblock @@ -286,34 +629,33 @@ struct B2bGemm { // Construct iterators to A and B operands typename B2bMma::IteratorA0 iterator_A0( params.params_A0, - params.ref_A0.data(), + ptr_A0, {params.problem_size_0.m(), problem_size_k_0}, thread_idx, tb_offset_A0); typename B2bMma::IteratorB0 iterator_B0( params.params_B0, - params.ref_B0.data(), + ptr_B0, {problem_size_k_0, params.problem_size_0.n()}, thread_idx, tb_offset_B0); typename B2bMma::IteratorB1 iterator_B1( params.params_B1, - params.ref_B1.data(), + ptr_B1, {problem_size_k_1, params.problem_size_1.n()}, thread_idx, tb_offset_B1); - // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0); + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); int lane_idx = threadIdx.x % 32; // Construct iterators to accumulator scale/bias vector typename B2bMma::IteratorAccumulatorScaleBias iterator_Scale0( - params.ref_Scale0.data(), + ptr_Scale0, {1, params.problem_size_0.n()}, thread_idx, warp_idx, @@ -323,7 +665,7 @@ struct B2bGemm { ); typename B2bMma::IteratorAccumulatorScaleBias iterator_Bias0( - params.ref_Bias0.data(), + ptr_Bias0, {1, params.problem_size_0.n()}, thread_idx, warp_idx, @@ -332,16 +674,19 @@ struct B2bGemm { ) ); - - // // Main loop // OutputOp0 output_op_0(params.output_op_0); + if (cutlass::gemm::threadblock::detail::IsGroupedSwizzle::value) { + // Wait for all threads to finish their epilogue phases from the previous tile. 
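      // Shared memory is a union of the mainloop and epilogue storage, and with a grouped
      // swizzle the same threadblock runs several problems back to back; without this
      // barrier, warps starting the next problem's mainloop could overwrite storage that
      // other warps are still reading in the previous problem's epilogue.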
+ __syncthreads(); + } + // Construct thread-scoped matrix multiply - B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx, params.problem_size_0.n()); typename B2bMma::FragmentC0 src_accum; typename B2bMma::FragmentC1 accumulators; @@ -349,11 +694,9 @@ struct B2bGemm { src_accum.clear(); accumulators.clear(); - if (!kSplitKSerial || gemm_k_iterations_0 > 0) { - // Compute threadblock-scoped matrix multiply-add - b2bMma(gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, - iterator_Scale0, iterator_Bias0, iterator_B1, src_accum, output_op_0); - } + // Compute threadblock-scoped matrix multiply-add + b2bMma(gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, + iterator_Scale0, iterator_Bias0, iterator_B1, src_accum, output_op_0); // // Epilogue @@ -376,23 +719,32 @@ struct B2bGemm { int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + ElementC *ptr_C1 = static_cast(params.ref_C1.data()); + ElementC *ptr_D1 = static_cast(params.ref_D1.data()); + // Construct the semaphore. Semaphore semaphore(params.semaphore + block_idx, thread_idx); - // If performing a reduction via split-K, fetch the initial synchronization - if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { - - // Fetch the synchronization lock initially but do not block. - semaphore.fetch(); + if (params.mode == GemmUniversalMode::kGemm) { + // If performing a reduction via split-K, fetch the initial synchronization + + if (params.grid_tiled_shape.k() > 1) { + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); - // Indicate which position in a serial reduction the output operator is currently updating - output_op_1.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); + // Indicate which position in a serial reduction the output operator is currently updating + output_op_1.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); + } + } + else if (params.mode == GemmUniversalMode::kBatched) { + ptr_C1 += threadblock_tile_offset.k() * params.batch_stride_C1; + ptr_D1 += threadblock_tile_offset.k() * params.batch_stride_D1; } // Tile iterator loading from source tensor. typename Epilogue::OutputTileIterator iterator_C1( params.params_C1, - params.ref_C1.data(), + ptr_C1, params.problem_size_1.mn(), thread_idx, threadblock_offset @@ -401,21 +753,21 @@ struct B2bGemm { // Tile iterator writing to destination tensor. typename Epilogue::OutputTileIterator iterator_D1( params.params_D1, - params.ref_D1.data(), + ptr_D1, params.problem_size_1.mn(), thread_idx, threadblock_offset ); Epilogue epilogue( - shared_storage.epilogue, - thread_idx, - warp_idx, + shared_storage.epilogue, + thread_idx, + warp_idx, lane_idx); // Wait on the semaphore - this latency may have been covered by iterator construction - if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { - + if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. if (threadblock_tile_offset.k()) { iterator_C1 = iterator_D1; @@ -427,14 +779,14 @@ struct B2bGemm { } // Execute the epilogue operator to update the destination tensor. 
- epilogue(output_op_1, iterator_D1, accumulators, iterator_C1); - + epilogue(output_op_1, iterator_D1, accumulators, iterator_C1); + // // Release the semaphore // - if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { - + if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { + int lock = 0; if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) { @@ -457,4 +809,3 @@ struct B2bGemm { } // namespace kernel } // namespace gemm } // namespace cutlass - diff --git a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm_grouped_problem_visitor.h b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm_grouped_problem_visitor.h new file mode 100644 index 0000000000..13faadf039 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm_grouped_problem_visitor.h @@ -0,0 +1,157 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Scheduler for grouped B2b GEMMs +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/gemm/kernel/grouped_problem_visitor.h" +#include "cutlass/gemm/kernel/gemm_grouped_problem_visitor.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Visitor class to abstract away the algorithm for iterating over tiles +template +struct B2bGemmGroupedProblemVisitor : public GroupedProblemVisitor< + detail::GemmGroupedProblemSizeHelper, + ThreadblockShape, + GroupScheduleMode_, + PrefetchTileCount, + ThreadCount> { + + using ProblemSizeHelper = detail::GemmGroupedProblemSizeHelper; + using Base = GroupedProblemVisitor; + using BaseParams = typename Base::Params; + using SharedStorage = typename Base::SharedStorage; + static bool const kTransposed = Transposed; + + cutlass::gemm::GemmCoord const *problem_sizes0; + cutlass::gemm::GemmCoord const *problem_sizes1; + + struct Params { + cutlass::gemm::GemmCoord const *problem_sizes0; + cutlass::gemm::GemmCoord const *problem_sizes1; + int32_t problem_count; + void const *workspace; + int32_t tile_count; + + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + Params(): problem_sizes0(nullptr), problem_sizes1(nullptr), + problem_count(0), workspace(nullptr), tile_count(0) { } + + /// Ctor + CUTLASS_HOST_DEVICE + Params( + cutlass::gemm::GemmCoord const *problem_sizes0, + cutlass::gemm::GemmCoord const *problem_sizes1, + int32_t problem_count, + void const *workspace = nullptr, + int32_t tile_count = 0 + ): + problem_sizes0(problem_sizes0), + problem_sizes1(problem_sizes1), + problem_count(problem_count), + workspace(workspace), + tile_count(tile_count) + {} + + /// Convert the B2b-GEMM-specific parameters to those used by the base class + CUTLASS_HOST_DEVICE + BaseParams to_base() const { + return BaseParams(// Set problem_sizes as problem_sizes0 because these determine + // shape of the grid used in the non-grouped B2b GEMM + problem_sizes0, + problem_count, + workspace, + tile_count); + } + + }; + + // + // Methods + // + CUTLASS_DEVICE + B2bGemmGroupedProblemVisitor( + Params const ¶ms_, + SharedStorage &shared_storage_, + int32_t block_idx + ): Base ( + params_.to_base(), + shared_storage_, block_idx), + problem_sizes0(params_.problem_sizes0), + problem_sizes1(params_.problem_sizes1) + {} + + /// Returns the problem size 0 for the current problem + CUTLASS_HOST_DEVICE + cutlass::gemm::GemmCoord problem_size0() const { + GemmCoord problem = problem_sizes0[this->problem_idx]; + ProblemSizeHelper::possibly_transpose_problem(problem); + return problem; + } + + /// Returns the problem size 1 for the current problem + CUTLASS_HOST_DEVICE + cutlass::gemm::GemmCoord problem_size1() const { + GemmCoord problem = problem_sizes1[this->problem_idx]; + ProblemSizeHelper::possibly_transpose_problem(problem); + return problem; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h 
b/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h index 8ebc10c0a2..d249a2c237 100644 --- a/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h +++ b/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h index da7c9aef68..1b604c040b 100644 --- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h +++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h index bd07b8b954..0168637b6d 100644 --- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h +++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h index ab45a7a411..d76fe8125d 100644 --- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h +++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h index 454f13ffd4..462ad1efd7 100644 --- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h +++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h index 55619134d9..e953567610 100644 --- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h +++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h index 4d807b9dc4..2ad3d7f386 100644 --- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h +++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -30,10 +30,10 @@ **************************************************************************************************/ /*! \file - \brief + \brief Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with the appropriate threadblock-scoped epilogue. - + Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are accommodated by exchanging A and B operands and assuming transposed layouts. Partial specializations here choose 'device::GemmTransposed' to implement this functionality. 
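The hunks that follow drop the `SplitKSerial` flag from `DefaultB2bGemm` and add a trailing `typename Enable = void` parameter; each partial specialization then constrains itself with `platform::enable_if` (CUTLASS's counterpart to `std::enable_if`) on the `IsGroupedSwizzle` trait, so the grouped kernel is selected purely from the threadblock swizzle type. A minimal standalone sketch of that selection mechanism, using assumed stand-in names (`IsGroupedSwizzleLike`, `DefaultKernelSelector`, `GroupedSwizzle`) rather than the real CUTLASS types, and not part of the patch itself:

#include <type_traits>
#include <iostream>

// Stand-in trait playing the role of detail::IsGroupedSwizzle<Swizzle>.
template <typename Swizzle>
struct IsGroupedSwizzleLike : std::false_type {};

struct LinearSwizzle {};
struct GroupedSwizzle {};

template <>
struct IsGroupedSwizzleLike<GroupedSwizzle> : std::true_type {};

// Primary template with a trailing SFINAE hook, mirroring the extra
// `typename Enable = void` parameter added to DefaultB2bGemm.
template <typename Swizzle, typename Enable = void>
struct DefaultKernelSelector {
  static const char *name() { return "non-grouped B2b kernel"; }
};

// Partial specialization chosen only when the swizzle reports itself as grouped.
template <typename Swizzle>
struct DefaultKernelSelector<
    Swizzle, typename std::enable_if<IsGroupedSwizzleLike<Swizzle>::value>::type> {
  static const char *name() { return "grouped B2b kernel"; }
};

int main() {
  std::cout << DefaultKernelSelector<LinearSwizzle>::name() << "\n";   // non-grouped B2b kernel
  std::cout << DefaultKernelSelector<GroupedSwizzle>::name() << "\n";  // grouped B2b kernel
  return 0;
}

In the patch the non-grouped specializations appear to carry the complementary `enable_if` condition (the negated trait), which is what lets both families coexist without ambiguity.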
@@ -63,7 +63,9 @@ #include "cutlass/transform/threadblock/predicated_tile_iterator.h" #include "kernel/b2b_gemm.h" +#include "kernel/grouped.h" #include "threadblock/default_b2b_mma.h" +#include "threadblock/grouped_threadblock_swizzle.h" //////////////////////////////////////////////////////////////////////////////// @@ -73,6 +75,9 @@ namespace kernel { //////////////////////////////////////////////////////////////////////////////// +template +using IsGroupedSwizzle = cutlass::gemm::threadblock::detail::IsGroupedSwizzle; + template < /// Element type for A matrix operand typename ElementA_, @@ -114,12 +119,12 @@ template < typename ThreadblockSwizzle, /// Number of stages used in the pipelined mainloop int Stages, - /// If true, kernel is configured to support serial reduction in the epilogue - bool SplitKSerial, /// Operation performed by GEMM typename Operator, /// Stage accumulator in shared memory - bool SmemAccumulator = false + bool SmemAccumulator = false, + /// Whether or not the operation is grouped + typename Enable = void > struct DefaultB2bGemm; @@ -161,17 +166,77 @@ template < typename ThreadblockSwizzle, /// Number of stages used in the pipelined mainloop int Stages, - /// If true, kernel is configured to support serial reduction in the - /// epilogue - bool SplitKSerial, /// Operation performed by GEMM typename Operator> struct DefaultB2bGemm { + EpilogueOutputOp0, EpilogueOutputOp1, ThreadblockSwizzle, Stages, + Operator, false, typename platform::enable_if::value>::type> { + /// Define the threadblock-scoped matrix multiply-accumulate + using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, + ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, + InstructionShape, Stages, Operator, EpilogueOutputOp0>::ThreadblockB2bMma; + + static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK; + + /// Define the epilogue + using Epilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1, + EpilogueOutputOp1::kCount>::Epilogue; + + /// Define the kernel-level GEMM operator. 
+ using B2bGemmKernel = kernel::B2bGemm; +}; + +/// Partial specialization for Ampere Architecture with grouped operation +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of A matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp0, + /// Epilogue output operator + typename EpilogueOutputOp1, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator> +struct DefaultB2bGemm::value>::type> { /// Define the threadblock-scoped matrix multiply-accumulate using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma< ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, @@ -188,7 +253,9 @@ struct DefaultB2bGemm::Epilogue; /// Define the kernel-level GEMM operator. - using B2bGemmKernel = kernel::B2bGemm; + using UnderlyingB2bGemmKernel = kernel::B2bGemm; + + using B2bGemmKernel = kernel::GroupedKernel; }; @@ -228,8 +295,6 @@ template < typename EpilogueOutputOp1, /// Threadblock-level swizzling operator typename ThreadblockSwizzle, - /// If true, kernel is configured to support serial reduction in the epilogue - bool SplitKSerial, /// Operation performed by GEMM typename Operator > @@ -249,8 +314,9 @@ struct DefaultB2bGemm< EpilogueOutputOp1, ThreadblockSwizzle, 2, - SplitKSerial, - Operator + Operator, + false, + typename platform::enable_if::value>::type > { /// Define the threadblock-scoped matrix multiply-accumulate @@ -274,7 +340,7 @@ struct DefaultB2bGemm< Operator, EpilogueOutputOp0 >::ThreadblockB2bMma; - + static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK; /// Define the epilogue @@ -287,7 +353,7 @@ struct DefaultB2bGemm< >::Epilogue; /// Define the kernel-level GEMM operator. 
- using B2bGemmKernel = kernel::B2bGemm; + using B2bGemmKernel = kernel::B2bGemm; }; @@ -323,20 +389,17 @@ template < int Stages, /// Number of Interleaved k int InterleavedK, - /// If true, kernel is configured to support serial reduction in the - /// epilogue - bool SplitKSerial, /// Operation performed by GEMM typename Operator> struct DefaultB2bGemm< ElementA, layout::ColumnMajorInterleaved, kAlignmentA, - ElementB, layout::RowMajorInterleaved, kAlignmentB, + ElementB, layout::RowMajorInterleaved, kAlignmentB, ElementC, layout::ColumnMajorInterleaved, int32_t, arch::OpClassTensorOp, arch::Sm80, ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1, ThreadblockSwizzle, Stages, - SplitKSerial, Operator> { + Operator, false, typename platform::enable_if::value>::type> { using LayoutA = layout::ColumnMajorInterleaved; using LayoutB = layout::RowMajorInterleaved; using LayoutC = layout::ColumnMajorInterleaved; @@ -360,7 +423,7 @@ struct DefaultB2bGemm< 64 / sizeof_bits::value, InterleavedK>::Epilogue; /// Define the kernel-level GEMM operator. - using B2bGemmKernel = kernel::B2bGemm; + using B2bGemmKernel = kernel::B2bGemm; }; //////////////////////////////////////////////////////////////////////////////// @@ -396,19 +459,17 @@ template < typename ThreadblockSwizzle, /// Number of Interleaved k int InterleavedK, - /// If true, kernel is configured to support serial reduction in the - /// epilogue - bool SplitKSerial, /// Operation performed by GEMM typename Operator> struct DefaultB2bGemm, kAlignmentA, ElementB, layout::RowMajorInterleaved, kAlignmentB, ElementC, layout::ColumnMajorInterleaved, - int32_t, arch::OpClassTensorOp, arch::Sm75, + int32_t, arch::OpClassTensorOp, arch::Sm75, ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1, - ThreadblockSwizzle, 2, SplitKSerial, Operator> { + ThreadblockSwizzle, 2, Operator, false, + typename platform::enable_if::value>::type> { using LayoutA = layout::ColumnMajorInterleaved; using LayoutB = layout::RowMajorInterleaved; using LayoutC = layout::ColumnMajorInterleaved; @@ -418,7 +479,7 @@ struct DefaultB2bGemm, /// Define the threadblock-scoped matrix multiply-accumulate using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma< ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementAccumulator, LayoutC, - arch::OpClassTensorOp, arch::Sm75, ThreadblockShape0, ThreadblockShape1, + arch::OpClassTensorOp, arch::Sm75, ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, InstructionShape, 2, Operator, EpilogueOutputOp0, true>::ThreadblockB2bMma; static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK; @@ -430,7 +491,7 @@ struct DefaultB2bGemm, 64 / sizeof_bits::value, InterleavedK>::Epilogue; /// Define the kernel-level GEMM operator. 
- using B2bGemmKernel = kernel::B2bGemm; + using B2bGemmKernel = kernel::B2bGemm; }; //////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h index fcff8672bf..ad548bc98b 100644 --- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h +++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -30,10 +30,10 @@ **************************************************************************************************/ /*! \file - \brief + \brief Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with the appropriate threadblock-scoped epilogue. - + Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are accommodated by exchanging A and B operands and assuming transposed layouts. Partial specializations here choose 'device::GemmTransposed' to implement this functionality. @@ -112,22 +112,19 @@ template < typename ThreadblockSwizzle, /// Number of stages used in the pipelined mainloop int Stages, - /// If true, kernel is configured to support serial reduction in the - /// epilogue - bool SplitKSerial, /// Operation performed by GEMM typename Operator> struct DefaultB2bGemm { /// Define the threadblock-scoped matrix multiply-accumulate using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma< ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, - ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, + ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, InstructionShape, Stages, Operator, EpilogueOutputOp0, false, true>::ThreadblockB2bMma; static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK; @@ -139,10 +136,9 @@ struct DefaultB2bGemm::Epilogue; /// Define the kernel-level GEMM operator. - using B2bGemmKernel = kernel::B2bGemm; + using B2bGemmKernel = kernel::B2bGemm; }; - //////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Turing Architecture @@ -179,8 +175,6 @@ template < typename EpilogueOutputOp1, /// Threadblock-level swizzling operator typename ThreadblockSwizzle, - /// If true, kernel is configured to support serial reduction in the epilogue - bool SplitKSerial, /// Operation performed by GEMM typename Operator > @@ -200,7 +194,6 @@ struct DefaultB2bGemm< EpilogueOutputOp1, ThreadblockSwizzle, 2, - SplitKSerial, Operator, true > { @@ -228,7 +221,7 @@ struct DefaultB2bGemm< false, true >::ThreadblockB2bMma; - + static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK; /// Define the epilogue @@ -241,7 +234,7 @@ struct DefaultB2bGemm< >::Epilogue; /// Define the kernel-level GEMM operator. 
- using B2bGemmKernel = kernel::B2bGemm; + using B2bGemmKernel = kernel::B2bGemm; }; @@ -277,20 +270,17 @@ template < int Stages, /// Number of Interleaved k int InterleavedK, - /// If true, kernel is configured to support serial reduction in the - /// epilogue - bool SplitKSerial, /// Operation performed by GEMM typename Operator> struct DefaultB2bGemm< ElementA, layout::ColumnMajorInterleaved, kAlignmentA, - ElementB, layout::RowMajorInterleaved, kAlignmentB, + ElementB, layout::RowMajorInterleaved, kAlignmentB, ElementC, layout::ColumnMajorInterleaved, int32_t, arch::OpClassTensorOp, arch::Sm80, ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1, ThreadblockSwizzle, Stages, - SplitKSerial, Operator, true> { + Operator, true> { using LayoutA = layout::ColumnMajorInterleaved; using LayoutB = layout::RowMajorInterleaved; using LayoutC = layout::ColumnMajorInterleaved; @@ -314,7 +304,7 @@ struct DefaultB2bGemm< 64 / sizeof_bits::value, InterleavedK>::Epilogue; /// Define the kernel-level GEMM operator. - using B2bGemmKernel = kernel::B2bGemm; + using B2bGemmKernel = kernel::B2bGemm; }; //////////////////////////////////////////////////////////////////////////////// @@ -350,19 +340,16 @@ template < typename ThreadblockSwizzle, /// Number of Interleaved k int InterleavedK, - /// If true, kernel is configured to support serial reduction in the - /// epilogue - bool SplitKSerial, /// Operation performed by GEMM typename Operator> struct DefaultB2bGemm, kAlignmentA, ElementB, layout::RowMajorInterleaved, kAlignmentB, ElementC, layout::ColumnMajorInterleaved, - int32_t, arch::OpClassTensorOp, arch::Sm75, + int32_t, arch::OpClassTensorOp, arch::Sm75, ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1, - ThreadblockSwizzle, 2, SplitKSerial, Operator, true> { + ThreadblockSwizzle, 2, Operator, true> { using LayoutA = layout::ColumnMajorInterleaved; using LayoutB = layout::RowMajorInterleaved; using LayoutC = layout::ColumnMajorInterleaved; @@ -371,9 +358,9 @@ struct DefaultB2bGemm, /// Define the threadblock-scoped matrix multiply-accumulate using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma< - ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, - ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm75, - ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm75, + ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, InstructionShape, 2, Operator, EpilogueOutputOp0, true, true>::ThreadblockB2bMma; static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK; @@ -385,7 +372,7 @@ struct DefaultB2bGemm, 64 / sizeof_bits::value, InterleavedK>::Epilogue; /// Define the kernel-level GEMM operator. - using B2bGemmKernel = kernel::B2bGemm; + using B2bGemmKernel = kernel::B2bGemm; }; //////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/kernel/grouped.h b/examples/13_two_tensor_op_fusion/kernel/grouped.h new file mode 100644 index 0000000000..2698a281a1 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/kernel/grouped.h @@ -0,0 +1,168 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief High-level interface for running a grouped version of a CUTLASS kernel +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/complex.h" +#include "cutlass/semaphore.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/trace.h" +#include "cutlass/gemm/kernel/gemm_transpose_operands.h" +#include "cutlass/gemm/kernel/gemm_grouped_problem_visitor.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// High-level interface for running a grouped version of a CUTLASS kernel +template < + typename BaseKernel_ ///! 
Kernel-scoped matrix multiply-accumulate +> +struct GroupedKernel { +public: + + using BaseKernel = BaseKernel_; + using Epilogue = typename BaseKernel::Epilogue; + + /// Types that need to be exported to work properly with device::BaseGrouped + using ElementA = typename BaseKernel::ElementA; + using LayoutA = typename BaseKernel::LayoutA; + using TensorRefA = TensorRef; + static ComplexTransform const kTransformA = BaseKernel::kTransformA; + static int const kAlignmentA = BaseKernel::kAlignmentA; + + using ElementB = typename BaseKernel::ElementB; + using LayoutB = typename BaseKernel::LayoutB; + using TensorRefB = TensorRef; + static ComplexTransform const kTransformB = BaseKernel::kTransformB; + static int const kAlignmentB = BaseKernel::kAlignmentB; + + using ElementC = typename BaseKernel::ElementC; + using LayoutC = typename BaseKernel::LayoutC; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + static int const kAlignmentC = BaseKernel::kAlignmentC; + + using ElementAccumulator = typename BaseKernel::Mma::Policy::Operator::ElementC; + + using EpilogueOutputOp = typename BaseKernel::EpilogueOutputOp; + using ThreadblockSwizzle = typename BaseKernel::ThreadblockSwizzle; + + using Operator = typename BaseKernel::Operator; + using WarpMmaOperator = typename BaseKernel::Mma::Policy::Operator; + + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename WarpMmaOperator::MathOperator; + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; + using ThreadblockShape = typename BaseKernel::Mma::Shape; + using WarpShape = typename BaseKernel::WarpShape; + using InstructionShape = typename BaseKernel::InstructionShape; + static int const kStages = BaseKernel::Mma::kStages; + + using Mma = typename BaseKernel::Mma; + + using Arguments = typename BaseKernel::GroupedArguments; + using Params = typename BaseKernel::GroupedParams; + using ProblemVisitor = typename ThreadblockSwizzle::ProblemVisitor; + + static int const kThreadCount = BaseKernel::kThreadCount; + + /// Shared memory storage structure + struct SharedStorage { + typename BaseKernel::SharedStorage kernel; + + // ProblemVisitor shared storage can't be overlapped with others + typename ProblemVisitor::SharedStorage problem_visitor; + }; + +public: + + // + // Methods + // + + CUTLASS_DEVICE + GroupedKernel() { } + + /// Determines whether kernel satisfies alignment + static Status can_implement(cutlass::gemm::GemmCoord const & problem_size) { + return Status::kSuccess; + } + + static Status can_implement(Arguments const &args) { + return Status::kSuccess; + } + + /// Executes a kernel-level GEMM in a loop + CUTLASS_DEVICE + void operator()(Params ¶ms, SharedStorage &shared_storage) { + + ThreadblockSwizzle swizzle(params.problem_visitor, shared_storage.problem_visitor, blockIdx.x); + + if (ProblemVisitor::kTransposed) { + params.transpose(); + } + + BaseKernel mma; + + // Outer 'persistent' loop to iterate over tiles + while (swizzle.problem_visitor.next_tile()) { + + typename BaseKernel::Params mma_params = params.to_single_params(swizzle.problem_visitor); + mma.run_with_swizzle(mma_params, shared_storage.kernel, swizzle); + + // Next tile + swizzle.problem_visitor.advance(gridDim.x); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + 
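`GroupedKernel::operator()` above is a persistent-threadblock scheduler: a fixed grid is launched once, and every threadblock keeps pulling tiles from the shared problem visitor, converting the grouped parameters into per-tile `BaseKernel::Params` via `to_single_params` and invoking the underlying B2b kernel until no tiles remain. This avoids launching one block per tile and lets the same grid load-balance across problems of different sizes. A minimal CUDA sketch of that loop structure, with hypothetical stand-ins (`TileWork`, `work[]`, `process_tile`) in place of `ProblemVisitor` and the fused B2b kernel, and with the tile list assumed to be precomputed rather than derived on the fly as the visitor may do:

#include <cuda_runtime.h>

struct TileWork {
  int problem_idx;   // which GEMM of the group this tile belongs to
  int tile_idx;      // which threadblock tile within that GEMM
};

__device__ void process_tile(TileWork tile) {
  // A real kernel would run the fused B2b mainloop and epilogue for this tile.
}

__global__ void grouped_persistent_kernel(TileWork const *work, int total_tiles) {
  // Each threadblock starts at its own block index and strides by the grid
  // size, mirroring swizzle.problem_visitor.advance(gridDim.x) above.
  for (int idx = blockIdx.x; idx < total_tiles; idx += gridDim.x) {
    process_tile(work[idx]);
  }
}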
+///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/reference/device/tensor_scale_bias.h b/examples/13_two_tensor_op_fusion/reference/device/tensor_scale_bias.h index cc33731dd9..e1ba6c563d 100644 --- a/examples/13_two_tensor_op_fusion/reference/device/tensor_scale_bias.h +++ b/examples/13_two_tensor_op_fusion/reference/device/tensor_scale_bias.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -69,7 +69,7 @@ __global__ void TensorScaleBiasGemm( TensorRefScalar tensor_scale, ///< scale tensor TensorRefScalar tensor_bias ///< bias tensor ) { - + ConvertOp convert_op; MatrixCoord output_coord( @@ -89,7 +89,7 @@ __global__ void TensorScaleBiasGemm( ScalarType bias = ScalarType(0); - if(tensor_bias.good()) + if(tensor_bias.good()) bias = tensor_bias.at({0, coord.column()}); tensor_out.at(coord) = convert_op( @@ -99,6 +99,70 @@ __global__ void TensorScaleBiasGemm( } } +template < + typename TensorRefIn, ///< Input TensorRef Type + typename TensorRefOut, ///< Output TensorRef Type + typename ScalarType, ///< alpha Type + typename TensorRefScalar, ///< Scale/Bias TensorRef Type + typename ConvertOp = NumericConverter, + int kMblock = 4, + int kNblock = 4 +> +__global__ void TensorScaleBiasGemmBatched( + gemm::GemmCoord problem_size, + TensorRefIn tensor_in, ///< input tensor + TensorRefOut tensor_out, ///< output tensor + ScalarType alpha, ///< alpha + TensorRefScalar tensor_scale, ///< scale tensor + TensorRefScalar tensor_bias, ///< bias tensor + int batch_count = 1, + int64_t batch_stride_tensor_in = 0, + int64_t batch_stride_tensor_out = 0, + int64_t batch_stride_tensor_scale = 0, + int64_t batch_stride_tensor_bias = 0 +) { + + ConvertOp convert_op; + int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock; + int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; + int batch_idx = blockIdx.z; + + tensor_in.add_pointer_offset(batch_idx * batch_stride_tensor_in); + tensor_out.add_pointer_offset(batch_idx * batch_stride_tensor_out); + tensor_scale.add_pointer_offset(batch_idx * batch_stride_tensor_scale); + tensor_bias.add_pointer_offset(batch_idx * batch_stride_tensor_bias); + + for (; batch_idx < batch_count; batch_idx += gridDim.z) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kNblock; j++) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kMblock; i++) { + int row = row_block + i; + int col = col_block + j; + MatrixCoord coord = MatrixCoord(row, col); + if (coord.row() < problem_size.m() && coord.column() < problem_size.n()) { + + ScalarType scale = alpha; + if(tensor_scale.good()) + scale = tensor_scale.at({0, coord.column()}); + + ScalarType bias = ScalarType(0); + + if(tensor_bias.good()) + bias = tensor_bias.at({0, coord.column()}); + + tensor_out.at(coord) = convert_op( + scale * ScalarType(tensor_in.at(coord)) + bias); + } + } + } + tensor_in.add_pointer_offset(batch_stride_tensor_in * gridDim.z); + tensor_out.add_pointer_offset(batch_stride_tensor_out * gridDim.z); + tensor_scale.add_pointer_offset(batch_stride_tensor_scale * gridDim.z); + tensor_bias.add_pointer_offset(batch_stride_tensor_bias * gridDim.z); + } +} + template < 
typename TensorRefIn, ///< Input TensorRef Type typename TensorRefOut, ///< Output TensorRef Type @@ -118,7 +182,7 @@ __global__ void TensorScaleBiasConv2d( TensorRefScalar tensor_scale, ///< scale tensor TensorRefScalar tensor_bias ///< bias tensor ) { - + ConvertOp convert_op; int64_t npq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; @@ -137,7 +201,7 @@ __global__ void TensorScaleBiasConv2d( int64_t npq = npq_start + m; thread_n[m] = int(npq / PQ); - + int64_t residual = npq % PQ; thread_p[m] = int(residual / problem_size.Q); thread_q[m] = int(residual % problem_size.Q); @@ -155,17 +219,17 @@ __global__ void TensorScaleBiasConv2d( ScalarType scale = alpha; if(tensor_scale.good()) scale = tensor_scale.at({0, thread_k}); - + ScalarType bias = ScalarType(0); - if(tensor_bias.good()) + if(tensor_bias.good()) bias = tensor_bias.at({0, thread_k}); - + tensor_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op( scale * ScalarType( tensor_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) ) + bias); } - } + } } } @@ -217,6 +281,62 @@ void TensorScaleBiasGemm( ); } +/// Apply scale and bias on a tensor +template < + typename ElementIn, ///< Input Type + typename ElementOut, ///< Output Type + typename Layout, ///< Layout of input/output tensor + typename ScalarType, ///< alpha Type + typename LayoutScaleBias, ///< Layout of scale and bias + typename ConvertOp = NumericConverter +> +void TensorScaleBiasGemmBatched( + gemm::GemmCoord problem_size, + TensorRef tensor_in, ///< input tensor + TensorRef tensor_out, ///< output tensor + ScalarType alpha, ///< alpha + TensorRef tensor_scale, ///< scale tensor + TensorRef tensor_bias, ///< bias tensor + int batch_count = 1, + int64_t batch_stride_tensor_in = 0, + int64_t batch_stride_tensor_out = 0, + int64_t batch_stride_tensor_scale = 0, + int64_t batch_stride_tensor_bias = 0 +) { + + int const kMblock = 4; + int const kNblock = 4; + + dim3 block(16, 8); + dim3 grid( + (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock), + (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock), + batch_count % std::numeric_limits::max() + ); + + kernel::TensorScaleBiasGemmBatched< + TensorRef, + TensorRef, + ScalarType, + TensorRef, + ConvertOp, + kMblock, + kNblock + ><<< grid, block >>> ( + problem_size, + tensor_in, + tensor_out, + alpha, + tensor_scale, + tensor_bias, + batch_count, + batch_stride_tensor_in, + batch_stride_tensor_out, + batch_stride_tensor_scale, + batch_stride_tensor_bias + ); +} + /// Apply scale and bias on a tensor template < typename ElementIn, ///< Input Type diff --git a/examples/13_two_tensor_op_fusion/test_run.h b/examples/13_two_tensor_op_fusion/test_run.h index b14becafc9..2bd6c720a4 100644 --- a/examples/13_two_tensor_op_fusion/test_run.h +++ b/examples/13_two_tensor_op_fusion/test_run.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h index 6229b59506..574b123dc7 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h index ab08c0051d..e7c7ad12dc 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h index 6d0f6db4c6..8313cef8e6 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -321,7 +321,7 @@ class B2bImplicitGemmPipelined : int smem_write_stage_idx = 1; // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). // // Mainloop @@ -461,7 +461,7 @@ class B2bImplicitGemmPipelined : int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1; // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). 
// // Mainloop diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h index 04e4f29f43..9775c19e26 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -341,7 +341,7 @@ class B2bImplicitGemmPipelinedSmemAccumulator : int smem_write_stage_idx = 1; // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). // // Mainloop diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h index 36d6d8c9f3..55a41be19c 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h index 22643570b2..2d5f616323 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h index 8104f63854..3fb684ce57 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -119,8 +119,10 @@ class B2bMmaMultistage : using Shape0 = Shape0_; ///< Iterates over tiles of A operand in global memory using IteratorA0 = IteratorA0_; + using IteratorA = IteratorA0; ///< Iterates over tiles of B operand in global memory using IteratorB0 = IteratorB0_; + using IteratorB = IteratorB0; ///< Policy describing tuning details using Policy0 = Policy0_; @@ -139,6 +141,10 @@ class B2bMmaMultistage : using IteratorB1 = IteratorB1_; ///< Policy describing tuning details using Policy1 = Policy1_; + + ///< Export Policy0 as the threadblock-level Mma's policy + using Policy = Policy0; + using Shape = Shape0; using SmemIteratorB1 = SmemIteratorB1_; @@ -188,6 +194,10 @@ class B2bMmaMultistage : /// Complex transform on B operand static ComplexTransform const kTransformB1 = Operator1::kTransformB; + /// Complex transform exports needed by higher-level kernels + static ComplexTransform const kTransformA = kTransformA0; + static ComplexTransform const kTransformB = kTransformB0; + /// Internal structure exposed for introspection. struct Detail { @@ -199,15 +209,15 @@ class B2bMmaMultistage : "GEMM operations."); /// Number of cp.async instructions to load one stage of operand A - static int const TBLDGSTSIterationsA0 = + static int const TBLoadIterationsA0 = IteratorA0::ThreadMap::Iterations::kCount; /// Number of cp.async instructions to load one stage of operand B - static int const TBLDGSTSIterationsB0 = + static int const TBLoadIterationsB0 = IteratorB0::ThreadMap::Iterations::kCount; /// Number of cp.async instructions to load one stage of operand B - static int const TBLDGSTSIterationsB1 = + static int const TBLoadIterationsB1 = IteratorB1::ThreadMap::Iterations::kCount; /// Number of stages @@ -215,15 +225,15 @@ class B2bMmaMultistage : /// Number of cp.async instructions to load on group of operand A static int const kAccessesPerGroupA0 = - (TBLDGSTSIterationsA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; + (TBLoadIterationsA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; /// Number of cp.async instructions to load on group of operand B static int const kAccessesPerGroupB0 = - (TBLDGSTSIterationsB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; + (TBLoadIterationsB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; /// Number of cp.async instructions to load on group of operand B static int const kAccessesPerGroupB1 = - (TBLDGSTSIterationsB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1; + (TBLoadIterationsB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1; }; private: @@ -267,7 +277,9 @@ class B2bMmaMultistage : ///< ID of warp int warp_idx, ///< ID of each thread within a warp - int lane_idx + int lane_idx, + ///< GEMM0 N is used for accumulator extent + int problem_size_0_n ): Base(shared_storage, thread_idx, warp_idx, lane_idx), smem_iterator_A0_(shared_storage.shared_storage0.operand_A_ref(), thread_idx), @@ -302,10 +314,10 @@ class B2bMmaMultistage : IteratorA0::kAccessesPerVector); this->smem_iterator_A0_.set_iteration_index(group_start_A0); - // LDGSTS for operand A + // Load for operand A CUTLASS_PRAGMA_UNROLL for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) { - if (group_start_A0 + j < Detail::TBLDGSTSIterationsA0) { + if (group_start_A0 + j < Detail::TBLoadIterationsA0) { typename IteratorA0::AccessType *dst_ptr = reinterpret_cast( 
this->smem_iterator_A0_.get()); @@ -332,10 +344,10 @@ class B2bMmaMultistage : IteratorB0::kAccessesPerVector); this->smem_iterator_B0_.set_iteration_index(group_start_B0); - // LDGSTS for operand B + // Load for operand B CUTLASS_PRAGMA_UNROLL for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) { - if (group_start_B0 + j < Detail::TBLDGSTSIterationsB0) { + if (group_start_B0 + j < Detail::TBLoadIterationsB0) { typename IteratorB0::AccessType *dst_ptr = reinterpret_cast( this->smem_iterator_B0_.get()); @@ -365,10 +377,10 @@ class B2bMmaMultistage : IteratorB1::kAccessesPerVector); this->smem_iterator_B1_.set_iteration_index(group_start_B1); - // LDGSTS for operand B + // Load for operand B CUTLASS_PRAGMA_UNROLL for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) { - if (group_start_B1 + j < Detail::TBLDGSTSIterationsB1) { + if (group_start_B1 + j < Detail::TBLoadIterationsB1) { typename IteratorB1::AccessType *dst_ptr = reinterpret_cast( this->smem_iterator_B1_.get()); @@ -428,9 +440,9 @@ class B2bMmaMultistage : iterator_A0.set_iteration_index(0); this->smem_iterator_A0_.set_iteration_index(0); - // LDGSTS for operand A + // Load for operand A CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < Detail::TBLDGSTSIterationsA0; ++j) { + for (int j = 0; j < Detail::TBLoadIterationsA0; ++j) { typename IteratorA0::AccessType *dst_ptr = reinterpret_cast( this->smem_iterator_A0_.get()); @@ -456,9 +468,9 @@ class B2bMmaMultistage : iterator_B0.set_iteration_index(0); this->smem_iterator_B0_.set_iteration_index(0); - // LDGSTS for operand B + // Load for operand B CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < Detail::TBLDGSTSIterationsB0; ++j) { + for (int j = 0; j < Detail::TBLoadIterationsB0; ++j) { typename IteratorB0::AccessType *dst_ptr = reinterpret_cast( this->smem_iterator_B0_.get()); @@ -639,6 +651,10 @@ class B2bMmaMultistage : } + // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); // 2nd Gemm @@ -657,12 +673,11 @@ class B2bMmaMultistage : tb_frag_A1_bias.clear(); iterator_A1_bias.load(tb_frag_A1_bias); ++iterator_A1_bias; - - + // // Prologue // - int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1; + int gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1; // Issue several complete stages CUTLASS_PRAGMA_UNROLL @@ -674,9 +689,9 @@ class B2bMmaMultistage : iterator_B1.set_iteration_index(0); this->smem_iterator_B1_.set_iteration_index(0); - // LDGSTS for operand B + // Load for operand B CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < Detail::TBLDGSTSIterationsB1; ++j) { + for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) { typename IteratorB1::AccessType *dst_ptr = reinterpret_cast( this->smem_iterator_B1_.get()); @@ -750,9 +765,9 @@ class B2bMmaMultistage : // Mainloop // + gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1 - (Base::kStages - 1); CUTLASS_PRAGMA_UNROLL - for (gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1 - (Base::kStages - 1); - gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) { + for (; gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) { // // Loop over GEMM K dimension // @@ -871,7 +886,10 @@ class B2bMmaMultistage : } - + // Commit and drain all pending and predicated cp.async pnz from 
the GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); } }; diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h index c28f4e49cd..35c4f5cc3f 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -121,8 +121,10 @@ class B2bMmaMultistageSmemAccumulator : using Shape0 = Shape0_; ///< Iterates over tiles of A operand in global memory using IteratorA0 = IteratorA0_; + using IteratorA = IteratorA0; ///< Iterates over tiles of B operand in global memory using IteratorB0 = IteratorB0_; + using IteratorB = IteratorB0; ///< Iterates over tiles of the scale and bias vectors in global memory using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_; ///< Policy describing tuning details @@ -141,6 +143,10 @@ class B2bMmaMultistageSmemAccumulator : ///< Policy describing tuning details using Policy1 = Policy1_; + ///< Export Policy0 as the threadblock-level Mma's policy + using Policy = Policy0; + using Shape = Shape0; + using SmemIteratorB1 = SmemIteratorB1_; using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate accumulator tile in shared memory @@ -194,6 +200,10 @@ class B2bMmaMultistageSmemAccumulator : /// Complex transform on B operand static ComplexTransform const kTransformB1 = Operator1::kTransformB; + /// Complex transform exports needed by higher-level kernels + static ComplexTransform const kTransformA = kTransformA0; + static ComplexTransform const kTransformB = kTransformB0; + /// Internal structure exposed for introspection. 
struct Detail { @@ -205,15 +215,15 @@ class B2bMmaMultistageSmemAccumulator : "GEMM operations."); /// Number of cp.async instructions to load one stage of operand A - static int const TBLDGSTSIterationsA0 = + static int const TBLoadIterationsA0 = IteratorA0::ThreadMap::Iterations::kCount; /// Number of cp.async instructions to load one stage of operand B - static int const TBLDGSTSIterationsB0 = + static int const TBLoadIterationsB0 = IteratorB0::ThreadMap::Iterations::kCount; /// Number of cp.async instructions to load one stage of operand B - static int const TBLDGSTSIterationsB1 = + static int const TBLoadIterationsB1 = IteratorB1::ThreadMap::Iterations::kCount; /// Number of stages @@ -221,15 +231,15 @@ class B2bMmaMultistageSmemAccumulator : /// Number of cp.async instructions to load on group of operand A static int const kAccessesPerGroupA0 = - (TBLDGSTSIterationsA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; + (TBLoadIterationsA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; /// Number of cp.async instructions to load on group of operand B static int const kAccessesPerGroupB0 = - (TBLDGSTSIterationsB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; + (TBLoadIterationsB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; /// Number of cp.async instructions to load on group of operand B static int const kAccessesPerGroupB1 = - (TBLDGSTSIterationsB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1; + (TBLoadIterationsB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1; }; private: @@ -276,13 +286,15 @@ class B2bMmaMultistageSmemAccumulator : ///< ID of warp int warp_idx, ///< ID of each thread within a warp - int lane_idx + int lane_idx, + ///< GEMM0 N is used for accumulator extent + int problem_size_0_n ): Base(shared_storage, thread_idx, warp_idx, lane_idx), smem_iterator_A0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_A_ref(), thread_idx), smem_iterator_B0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_B_ref(), thread_idx), smem_iterator_D0_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx), - warp_tile_iterator_A1_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx), + warp_tile_iterator_A1_(shared_storage.accumulator_shared_storage0.accum_ref(), {Base::WarpGemm1::kM, problem_size_0_n}, lane_idx ), smem_iterator_B1_(shared_storage.b2b_mma_shared_storage.shared_storage1.operand_B_ref(), thread_idx) { // Compute warp location within threadblock tile by mapping the warp_id to @@ -325,10 +337,10 @@ class B2bMmaMultistageSmemAccumulator : IteratorA0::kAccessesPerVector); this->smem_iterator_A0_.set_iteration_index(group_start_A0); - // LDGSTS for operand A + // cp.async for operand A CUTLASS_PRAGMA_UNROLL for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) { - if (group_start_A0 + j < Detail::TBLDGSTSIterationsA0) { + if (group_start_A0 + j < Detail::TBLoadIterationsA0) { typename IteratorA0::AccessType *dst_ptr = reinterpret_cast( this->smem_iterator_A0_.get()); @@ -355,10 +367,10 @@ class B2bMmaMultistageSmemAccumulator : IteratorB0::kAccessesPerVector); this->smem_iterator_B0_.set_iteration_index(group_start_B0); - // LDGSTS for operand B + // cp.async for operand B CUTLASS_PRAGMA_UNROLL for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) { - if (group_start_B0 + j < Detail::TBLDGSTSIterationsB0) { + if (group_start_B0 + j < Detail::TBLoadIterationsB0) { typename IteratorB0::AccessType *dst_ptr = reinterpret_cast( 
this->smem_iterator_B0_.get()); @@ -388,10 +400,10 @@ class B2bMmaMultistageSmemAccumulator : IteratorB1::kAccessesPerVector); this->smem_iterator_B1_.set_iteration_index(group_start_B1); - // LDGSTS for operand B + // cp.async for operand B CUTLASS_PRAGMA_UNROLL for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) { - if (group_start_B1 + j < Detail::TBLDGSTSIterationsB1) { + if (group_start_B1 + j < Detail::TBLoadIterationsB1) { typename IteratorB1::AccessType *dst_ptr = reinterpret_cast( this->smem_iterator_B1_.get()); @@ -451,9 +463,9 @@ class B2bMmaMultistageSmemAccumulator : iterator_A0.set_iteration_index(0); this->smem_iterator_A0_.set_iteration_index(0); - // LDGSTS for operand A + // cp.async for operand A CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < Detail::TBLDGSTSIterationsA0; ++j) { + for (int j = 0; j < Detail::TBLoadIterationsA0; ++j) { typename IteratorA0::AccessType *dst_ptr = reinterpret_cast( this->smem_iterator_A0_.get()); @@ -479,9 +491,9 @@ class B2bMmaMultistageSmemAccumulator : iterator_B0.set_iteration_index(0); this->smem_iterator_B0_.set_iteration_index(0); - // LDGSTS for operand B + // cp.async for operand B CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < Detail::TBLDGSTSIterationsB0; ++j) { + for (int j = 0; j < Detail::TBLoadIterationsB0; ++j) { typename IteratorB0::AccessType *dst_ptr = reinterpret_cast( this->smem_iterator_B0_.get()); @@ -662,6 +674,11 @@ class B2bMmaMultistageSmemAccumulator : } + // Insert fence and wait for all outstanding cp.async operations to commit. + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + /// Epilogue for the first Implicit Gemm Epilogue0 epilogue0; @@ -687,9 +704,9 @@ class B2bMmaMultistageSmemAccumulator : iterator_B1.set_iteration_index(0); this->smem_iterator_B1_.set_iteration_index(0); - // LDGSTS for operand B + // cp.async for operand B CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < Detail::TBLDGSTSIterationsB1; ++j) { + for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) { typename IteratorB1::AccessType *dst_ptr = reinterpret_cast( this->smem_iterator_B1_.get()); @@ -853,7 +870,10 @@ class B2bMmaMultistageSmemAccumulator : } - + // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); } }; diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h index 4e39fda5b6..d5f1629466 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -126,7 +126,9 @@ class B2bMmaPipelined : using Shape0 = Shape0_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> using IteratorA0 = IteratorA0_; ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA0; using IteratorB0 = IteratorB0_; ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB0; using Policy0 = Policy0_; ///< Policy describing tuning details using SmemIteratorA0 = SmemIteratorA0_; @@ -139,6 +141,8 @@ class B2bMmaPipelined : FragmentIteratorA1ScaleBias_; ///< WarpIterator to load Scale or Bias vector from the threadblock fragment using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory using Policy1 = Policy1_; ///< Policy describing tuning details + using Policy = Policy1; ///< Export Policy1 as the threadblock-level Mma's policy + using Shape = Shape1; using SmemIteratorB1 = SmemIteratorB1_; @@ -195,6 +199,10 @@ class B2bMmaPipelined : /// Complex transform on B1 operand static ComplexTransform const kTransformB1 = Operator1::kTransformB; + /// Complex transform exports needed by higher-level kernels + static ComplexTransform const kTransformA = kTransformA0; + static ComplexTransform const kTransformB = kTransformB0; + /// staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2"); @@ -228,7 +236,8 @@ class B2bMmaPipelined : typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM int thread_idx, ///< ID within the threadblock int warp_idx, ///< ID of warp - int lane_idx ///< ID of each thread within a warp + int lane_idx, ///< ID of each thread within a warp + int problem_size_0_n ///< GEMM0 N is used for accumulator extent ): Base(shared_storage, thread_idx, warp_idx, lane_idx), smem_iterator_A_(shared_storage.shared_storage0.operand_A_ref(), thread_idx), @@ -324,7 +333,7 @@ class B2bMmaPipelined : iterator_B0.clear_mask(gemm_k_iterations_0 <= 1); // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). // // Mainloop diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h index b548c85763..c3393e0ccf 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -128,7 +128,9 @@ class B2bMmaPipelinedSmemAccumulator : using Shape0 = Shape0_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> using IteratorA0 = IteratorA0_; ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA0; using IteratorB0 = IteratorB0_; ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB0; using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_; ///< Iterates over tiles of the scale and bias vectors in global memory using Policy0 = Policy0_; ///< Policy0 describing tuning details @@ -141,6 +143,8 @@ class B2bMmaPipelinedSmemAccumulator : using Shape1 = Shape1_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory using Policy1 = Policy1_; ///< Policy1 describing tuning details + using Policy = Policy1; ///< Export Policy1 as the threadblock-level Mma's policy + using Shape = Shape1; using SmemIteratorB1 = SmemIteratorB1_; using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate accumulator tile in shared memory @@ -192,6 +196,10 @@ class B2bMmaPipelinedSmemAccumulator : /// Complex transform on B1 operand static ComplexTransform const kTransformB1 = Operator1::kTransformB; + /// Complex transform exports needed by higher-level kernels + static ComplexTransform const kTransformA = kTransformA0; + static ComplexTransform const kTransformB = kTransformB0; + /// staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2"); @@ -236,13 +244,14 @@ class B2bMmaPipelinedSmemAccumulator : typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM int thread_idx, ///< ID within the threadblock int warp_idx, ///< ID of warp - int lane_idx ///< ID of each thread within a warp + int lane_idx, ///< ID of each thread within a warp + int problem_size_0_n ///< GEMM0 N is used for accumulator extent ): Base(shared_storage, thread_idx, warp_idx, lane_idx), smem_iterator_A_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_A_ref(), thread_idx), smem_iterator_B0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_B_ref(), thread_idx), smem_iterator_D0_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx), - warp_tile_iterator_A1_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx), + warp_tile_iterator_A1_(shared_storage.accumulator_shared_storage0.accum_ref(), {Base::WarpGemm1::kM, problem_size_0_n}, lane_idx), smem_iterator_B1_(shared_storage.b2b_mma_shared_storage.shared_storage1.operand_B_ref(), thread_idx) { // Compute warp location within threadblock tile by mapping the warp_id to @@ -345,7 +354,7 @@ class B2bMmaPipelinedSmemAccumulator : iterator_B0.clear_mask(gemm_k_iterations_0 <= 1); // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). 
// // Mainloop diff --git a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h index 3c12e05cc0..2ea38cebfa 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h +++ b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h index ea1a258fbd..7a97ce0312 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h +++ b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -43,7 +43,7 @@ #include "cutlass/gemm/threadblock/default_mma_core_sm70.h" #include "cutlass/gemm/threadblock/default_mma_core_sm75.h" #include "cutlass/gemm/threadblock/default_mma_core_sm80.h" -#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h" #include "threadblock/b2b_mma_pipelined_smem_accumulator.h" #include "threadblock/b2b_mma_multistage_smem_accumulator.h" @@ -158,11 +158,11 @@ struct DefaultB2bMma, cutlass::gemm::Operand::kA, + using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileAccessIterator< + MatrixShape, cutlass::gemm::Operand::kA, ElementA, SmemAccumulatorLayout, MatrixShape, - WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>; + WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount, true>; // Define the threadblock-scoped pipelined matrix multiply using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelinedSmemAccumulator< @@ -303,11 +303,11 @@ struct DefaultB2bMma, cutlass::gemm::Operand::kA, + using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileAccessIterator< + MatrixShape, cutlass::gemm::Operand::kA, ElementA, SmemAccumulatorLayout, MatrixShape, - WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>; + WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount, true>; // Define the threadblock-scoped pipelined matrix multiply using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaMultistageSmemAccumulator< @@ -436,11 +436,11 @@ struct DefaultB2bMma, cutlass::gemm::Operand::kA, + using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileAccessIterator< + MatrixShape, cutlass::gemm::Operand::kA, ElementA, SmemAccumulatorLayout, MatrixShape, - WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>; + WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount, true>; // Define the threadblock-scoped pipelined matrix multiply using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelinedSmemAccumulator< @@ -574,11 +574,11 @@ struct DefaultB2bMma, 
cutlass::gemm::Operand::kA, + using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileAccessIterator< + MatrixShape, cutlass::gemm::Operand::kA, ElementA, SmemAccumulatorLayout, MatrixShape, - WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>; + WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount, true >; // Define the threadblock-scoped multistage matrix multiply diff --git a/examples/13_two_tensor_op_fusion/threadblock/grouped_threadblock_swizzle.h b/examples/13_two_tensor_op_fusion/threadblock/grouped_threadblock_swizzle.h new file mode 100644 index 0000000000..c79b7c77f9 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/threadblock/grouped_threadblock_swizzle.h @@ -0,0 +1,125 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Implements several threadblock-swizzling functions for grouped kernels +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/kernel/grouped_problem_visitor.h" +#include "cutlass/gemm/kernel/gemm_grouped_problem_visitor.h" +#include "kernel/b2b_gemm_grouped_problem_visitor.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +struct GroupedThreadblockSwizzleBase {}; + +/// Helper for determining if a swizzling function is specialized for grouped operation +template +struct IsGroupedSwizzle { + static bool const value = cutlass::platform::is_base_of::value; +}; + +} // namespace detail + +/// Swizzling function for grouped kernels +template +struct GroupedThreadblockSwizzle : detail::GroupedThreadblockSwizzleBase { + + using ProblemVisitor = ProblemVisitor_; + ProblemVisitor problem_visitor; + + CUTLASS_HOST_DEVICE + GroupedThreadblockSwizzle(typename ProblemVisitor::Params& params, + typename ProblemVisitor::SharedStorage& shared_storage, + int block_idx) : problem_visitor(params, shared_storage, block_idx) {} + + /// Obtains the threadblock offset (in units of threadblock-scoped tiles) + CUTLASS_DEVICE + GemmCoord get_tile_offset(int /*log_tile*/) const { + GemmCoord problem_size = problem_visitor.problem_size(); + int32_t threadblock_idx = int32_t(problem_visitor.threadblock_idx()); + GemmCoord grid_shape = problem_visitor.grid_shape(problem_size); + + return GemmCoord(int(threadblock_idx / grid_shape.n()), + int(threadblock_idx % grid_shape.n()), + 0); + } + + /// Dummy method to satisfy API for threadblock swizzling functions + CUTLASS_HOST_DEVICE + static int get_log_tile(GemmCoord /*tiled_shape*/) { + return 0; + } +}; + +template < + typename ThreadblockShape, + typename LayoutC, + cutlass::gemm::kernel::GroupScheduleMode GroupScheduleMode_ = cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly, + int PrefetchTileCount = 128, + int ThreadCount = PrefetchTileCount> +struct B2bGemmGroupedThreadblockSwizzle : GroupedThreadblockSwizzle< + cutlass::gemm::kernel::B2bGemmGroupedProblemVisitor< + ThreadblockShape, + GroupScheduleMode_, + PrefetchTileCount, + ThreadCount, + platform::is_same::value + > + > { + using Base = GroupedThreadblockSwizzle::value>>; + + CUTLASS_HOST_DEVICE + B2bGemmGroupedThreadblockSwizzle(typename Base::ProblemVisitor::Params& params, + typename Base::ProblemVisitor::SharedStorage& shared_storage, + int block_idx) : Base(params, shared_storage, block_idx) {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt index 23b8f0dd46..3e0b870f30 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt +++ b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu index eed9f9b299..99d3cdb178 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu +++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -194,7 +194,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 16>; // <- warp tile M = using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 16, N = 8, K = 8 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // This code section describes the epilogue part of the kernel using EpilogueOp = cutlass::epilogue::thread::LinearCombination< diff --git a/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt b/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt index fbd20df8a8..02d3205889 100644 --- a/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt +++ b/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -33,3 +33,12 @@ cutlass_example_add_executable( ampere_sparse_tensorop_gemm.cu ) +cutlass_example_add_executable( + 15_ampere_sparse_tensorop_gemm_universal + ampere_sparse_tensorop_gemm_universal.cu + ) + +cutlass_example_add_executable( + 15_ampere_sparse_tensorop_gemm_with_visitor + ampere_sparse_tensorop_gemm_with_visitor.cu + ) diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu index 4a25a96341..e92b717caa 100644 --- a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu +++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -84,7 +84,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 256>; // <- warp tile M = using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 128>; // <- MMA Op tile M = 16, N = 8, K = 128 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? 
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // This code section describes the epilogue part of the kernel using EpilogueOp = cutlass::epilogue::thread::LinearCombination< diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_universal.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_universal.cu new file mode 100644 index 0000000000..dcab5ac144 --- /dev/null +++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_universal.cu @@ -0,0 +1,329 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** +Please check example 07, 08 and 17 for the basics of dense tensor op gemm kernels. NVIDIA Ampere +architecture also supports structured sparse tensor op for tf32, fp16, int8 and int4. + +Sparse GEMM kernels needs to takes an additional E matrix which stores the meta data. The format of +meta data is different for every data types. CUTLASS templates can automatically infer it based on +input A and B. Check code below. + +Moreover, matrix E needs to be preprocessed so that it can use ldmatrix to load into the registers +efficiently. 
+*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_sparse_universal.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/host_reorder.h" +#include "cutlass/util/host_uncompress.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" +#include "helper.h" + +// The code section below describes datatype for input, output matrices and computation between +// elements in input matrices. +using ElementAccumulator = int32_t; // <- data type of accumulator +using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations +using ElementInputA = cutlass::int4b_t; // <- data type of elements in input matrix A +using ElementInputB = cutlass::int4b_t; // <- data type of elements in input matrix B +using ElementOutput = int32_t; // <- data type of elements in output matrix D + +// The code section below describes matrix layout of input and output matrices. Row Major for +// Matrix A, Column Major for Matrix B and Row Major for Matrix C +using LayoutInputA = cutlass::layout::RowMajor; +using LayoutInputB = cutlass::layout::ColumnMajor; +using LayoutOutput = cutlass::layout::RowMajor; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm80; + +// This code section describes the tile size a thread block will compute +using ShapeMMAThreadBlock = + cutlass::gemm::GemmShape<128, 128, 256>; // <- threadblock tile M = 128, N = 128, K = 256 +// This code section describes tile size a warp will compute +using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 256>; // <- warp tile M = 64, N = 64, K = 256 +// This code section describes the size of MMA op +using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 128>; // <- MMA Op tile M = 16, N = 8, K = 128 + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + +// This code section describes the epilogue part of the kernel +using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + +// Number of pipelines you want to use +constexpr int NumStages = 3; + +using Gemm = cutlass::gemm::device::GemmSparseUniversal; + +// Data type and layout of meta data matrix E can be inferred from template Gemm. 
+using ElementInputE = typename Gemm::ElementE; +using LayoutInputE = cutlass::layout::RowMajor; +using ReorderedLayoutInputE = typename Gemm::LayoutE; + +// Blow property is defined in include/cutlass/arch/sp_mma_sm80.h +// 50% Sparsity on Ampere +constexpr int kSparse = Gemm::kSparse; +// How many elements of A are covered per ElementE +constexpr int kElementsPerElementE = Gemm::kElementsPerElementE; +// The size of individual meta data +constexpr int kMetaSizeInBits = Gemm::kMetaSizeInBits; + +int run() { + + const int length_m = 512; + const int length_n = 512; + const int length_k = 1024; + + // Create a tuple of problem size for matrix multiplication + cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k); + + // Initialize tensors using CUTLASS helper functions + cutlass::HostTensor tensor_a( + cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse)); // <- Create matrix A with dimensions M x (K / 2) + cutlass::HostTensor tensor_a_uncompressed( + problem_size.mk()); // <- Create uncompressed matrix A with dimensions M x K for reference computing + + cutlass::HostTensor tensor_b( + problem_size.kn()); // <- Create matrix B with dimensions K x N + cutlass::HostTensor tensor_c( + problem_size.mn()); // <- Create matrix C with dimensions M x N + cutlass::HostTensor tensor_d( + problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from + // CUTLASS kernel + cutlass::HostTensor tensor_ref_d( + problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from + // reference kernel + + // Create matrix E with dimensions M x (K / 2 / kElementsPerElementE). This one is used by reference computing. + cutlass::HostTensor tensor_e( + cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse / kElementsPerElementE)); + // Same size as the above. The above one needs to be reordered and stored in this one. + cutlass::HostTensor tensor_e_reordered( + cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse / kElementsPerElementE)); + + // Fill input and output matrices on host using CUTLASS helper functions + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(2), + ElementInputA(-2), + 0); // <- Fill matrix A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(2), + ElementInputB(-2), + 0); // <- Fill matrix B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c.host_view(), + 1, + ElementOutput(2), + ElementOutput(-2), + 0); // <- Fill matrix C on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomSparseMeta( + tensor_e.host_view(), + 1, + kMetaSizeInBits); // <- Fill matrix E on host with uniform-distribution random meta data + cutlass::reference::host::TensorFill( + tensor_d.host_view()); // <- fill matrix D on host with zeros + cutlass::reference::host::TensorFill( + tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros + + // Reorder the meta data matrix so that we can use ldmatrix to load them to tensor core + // instructions. 
+ cutlass::reorder_meta(tensor_e_reordered.host_ref(), tensor_e.host_ref(), + {problem_size.m(), problem_size.n(), + problem_size.k() / kSparse / kElementsPerElementE}); + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c.sync_device(); + tensor_d.sync_device(); + tensor_e_reordered.sync_device(); + tensor_ref_d.sync_device(); + + // Initialize alpha and beta for dot product computation + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + ElementComputeEpilogue beta = ElementComputeEpilogue(0); + + // Split K dimension into 1 partitions + int split_k_slices = 2; + + // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch + // instantiated CUTLASS kernel + typename Gemm::Arguments arguments{cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, // <- problem size of matrix multiplication + split_k_slices,// <- k-dimension split factor + {alpha, beta}, // <- tuple of alpha and beta + tensor_a.device_data(), // <- reference to matrix A on device + tensor_b.device_data(), // <- reference to matrix B on device + tensor_c.device_data(), // <- reference to matrix C on device + tensor_d.device_data(), // <- reference to matrix D on device + tensor_e_reordered.device_data(), // <- reference to matrix E on device + int64_t(), + int64_t(), + int64_t(), + int64_t(), + int64_t(), + tensor_a.layout().stride(0), + tensor_b.layout().stride(0), + tensor_c.layout().stride(0), + tensor_d.layout().stride(0), + tensor_e_reordered.layout().stride(0) + }; + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm_op; + + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + + // Launch initialized CUTLASS kernel + status = gemm_op(); + CUTLASS_CHECK(status); + + // uncompress tensor_a based on meta data tensor_e. We need it for reference computing. + cutlass::uncompress(tensor_a_uncompressed.host_ref(), tensor_a.host_ref(), + tensor_e.host_ref(), problem_size.m(), problem_size.k()); + + // Create instantiation for host reference gemm kernel + cutlass::reference::host::Gemm + gemm_host; + + // Launch host reference gemm kernel + gemm_host(problem_size, + alpha, + tensor_a_uncompressed.host_ref(), + tensor_b.host_ref(), + beta, + tensor_c.host_ref(), + tensor_ref_d.host_ref()); + + // Copy output data from CUTLASS host for comparison + tensor_d.sync_host(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::host::TensorEquals( + tensor_d.host_view(), + tensor_ref_d.host_view()); + + std::cout << (passed ? "Passed" : "Failed") << std::endl; + + return (passed ? 0 : -1); +} + +int main() { + + bool notSupported = false; + + // Ampere Sparse Tensor Core operations exposed with mma.sync and ldmatrix are first available + // in CUDA 11.1. + // + // CUTLASS must be compiled with CUDA 11.1 Toolkit to run these examples. 
+ + if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.1 Toolkit or later." << std::endl; + notSupported = true; + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major * 10 + props.minor < 80) { + std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80." + << std::endl; + notSupported = true; + } + + if (notSupported) { + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + return run(); +} diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu new file mode 100644 index 0000000000..90aa44528e --- /dev/null +++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu @@ -0,0 +1,377 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** +Please check example 07, 08 and 17 for the basics of dense tensor op gemm kernels. NVIDIA Ampere +architecture also supports structured sparse tensor op for tf32, fp16, int8 and int4. +Sparse GEMM kernels needs to takes an additional E matrix which stores the meta data. The format of +meta data is different for every data types. CUTLASS templates can automatically infer it based on +input A and B. Check code below. 
+Moreover, matrix E needs to be preprocessed so that it can use ldmatrix to load into the registers +efficiently. +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_sparse_with_visitor.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/host_reorder.h" +#include "cutlass/util/host_uncompress.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" + +#include "helper.h" + +// The code section below describes datatype for input, output matrices and computation between +// elements in input matrices. +using ElementAccumulator = int32_t; // <- data type of accumulator +using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations +using ElementInputA = int8_t; // <- data type of elements in input matrix A +using ElementInputB = int8_t; // <- data type of elements in input matrix B +using ElementOutput = int32_t; // <- data type of elements in output matrix D + +// The code section below describes matrix layout of input and output matrices. Row Major for +// Matrix A, Column Major for Matrix B and Row Major for Matrix C +using LayoutInputA = cutlass::layout::RowMajor; +using LayoutInputB = cutlass::layout::ColumnMajor; +using LayoutOutput = cutlass::layout::RowMajor; + +// The number of elements per vectorized memory access. +constexpr int AlignmentInputA = 128 / cutlass::sizeof_bits::value; +constexpr int AlignmentInputB = 128 / cutlass::sizeof_bits::value; +constexpr int AlignmentComputeEpilogue = 128 / cutlass::sizeof_bits::value; +constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits::value; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm80; + +// This code section describes the tile size a thread block will compute +using ShapeMMAThreadBlock = + cutlass::gemm::GemmShape<128, 128, 128>; // <- threadblock tile M = 128, N = 128, K = 128 +// This code section describes tile size a warp will compute +using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 128>; // <- warp tile M = 64, N = 64, K = 128 +// This code section describes the size of MMA op +using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 64>; // <- MMA Op tile M = 16, N = 8, K = 64 + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + +using Operator = cutlass::arch::OpMultiplyAddSaturate; + +// Number of pipelines you want to use +constexpr int NumStages = 3; + +constexpr auto NumEVTEpilogueStages = 1; + +using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; + +using BiasTileThreadMap = cutlass::epilogue::threadblock::OutputTileThreadLayout< + ShapeMMAThreadBlock, + ShapeMMAWarp, + ElementComputeEpilogue, + AlignmentComputeEpilogue, + NumEVTEpilogueStages>; + +using Bias = cutlass::epilogue::threadblock::VisitorAuxLoad< + BiasTileThreadMap, + ElementComputeEpilogue, + cute::Stride>; + +using ApplyBias = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::plus, ElementComputeEpilogue, ElementComputeEpilogue, + cutlass::FloatRoundStyle::round_to_nearest>; + +using EVTApplyBias 
= cutlass::epilogue::threadblock::Sm80EVT< + ApplyBias, + Accum, + Bias>; + +using OutputTileThreadMap = cutlass::epilogue::threadblock::OutputTileThreadLayout< + ShapeMMAThreadBlock, + ShapeMMAWarp, + ElementOutput, + AlignmentOutput, + NumEVTEpilogueStages>; + +using Output = cutlass::epilogue::threadblock::VisitorAuxStore< + OutputTileThreadMap, ElementOutput, + cutlass::FloatRoundStyle::round_to_nearest, + cute::Stride>; + +using EVTOutput = cutlass::epilogue::threadblock::Sm80EVT< + Output, + EVTApplyBias>; + +// Use element type in EVT with the smallest bitwidth as ElementC. +using ElementC = ElementComputeEpilogue; +using LayoutC = LayoutOutput; + +using Gemm = + typename cutlass::gemm::device::SparseGemmWithVisitor< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, + ElementC, LayoutC, + ElementAccumulator, + MMAOp, + SmArch, + ShapeMMAThreadBlock, + ShapeMMAWarp, + ShapeMMAOp, + EVTOutput, + SwizzleThreadBlock, + NumStages, + AlignmentInputA, + AlignmentInputB, + Operator, + NumEVTEpilogueStages>; + +// Data type and layout of meta data matrix E can be inferred from template Gemm. +using ElementInputE = typename Gemm::GemmKernel::ElementE; +using LayoutInputE = cutlass::layout::RowMajor; +using ReorderedLayoutInputE = typename Gemm::GemmKernel::LayoutE; + +// Blow property is defined in include/cutlass/arch/sp_mma_sm80.h +// 50% Sparsity on Ampere +constexpr int kSparse = Gemm::kSparse; +// How many elements of A are covered per ElementE +constexpr int kElementsPerElementE = Gemm::kElementsPerElementE; +// The size of individual meta data +constexpr int kMetaSizeInBits = Gemm::kMetaSizeInBits; + +int run() { + + const int length_m = 512; + const int length_n = 512; + const int length_k = 1024; + + // Create a tuple of problem size for matrix multiplication + cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k); + + // Initialize tensors using CUTLASS helper functions + cutlass::HostTensor tensor_a( + cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse)); // <- Create matrix A with dimensions M x (K / 2) + cutlass::HostTensor tensor_a_uncompressed( + problem_size.mk()); // <- Create uncompressed matrix A with dimensions M x K for reference computing + + cutlass::HostTensor tensor_b( + problem_size.kn()); // <- Create matrix B with dimensions K x N + cutlass::HostTensor tensor_c( + problem_size.mn()); // <- Create matrix C with dimensions M x N + cutlass::HostTensor tensor_d( + problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from + // CUTLASS kernel + cutlass::HostTensor tensor_ref_d( + problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from + // reference kernel + + // Create matrix E with dimensions M x (K / 2 / kElementsPerElementE). This one is used by reference computing. + cutlass::HostTensor tensor_e( + cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse / kElementsPerElementE)); + // Same size as the above. The above one needs to be reordered and stored in this one. 
+ cutlass::HostTensor tensor_e_reordered( + cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse / kElementsPerElementE)); + + // Fill input and output matrices on host using CUTLASS helper functions + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(8), + ElementInputA(-8), + 0); // <- Fill matrix A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(8), + ElementInputB(-8), + 0); // <- Fill matrix B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c.host_view(), + 1, + ElementOutput(8), + ElementOutput(-8), + 0); // <- Fill matrix C on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomSparseMeta( + tensor_e.host_view(), + 1, + kMetaSizeInBits); // <- Fill matrix E on host with uniform-distribution random meta data + cutlass::reference::host::TensorFill( + tensor_d.host_view()); // <- fill matrix D on host with zeros + cutlass::reference::host::TensorFill( + tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros + + // Reorder the meta data matrix so that we can use ldmatrix to load them to tensor core + // instructions. + cutlass::reorder_meta(tensor_e_reordered.host_ref(), tensor_e.host_ref(), + {problem_size.m(), problem_size.n(), + problem_size.k() / kSparse / kElementsPerElementE}); + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c.sync_device(); + tensor_d.sync_device(); + tensor_e_reordered.sync_device(); + tensor_ref_d.sync_device(); + + // Initialize alpha and beta for dot product computation + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + ElementComputeEpilogue beta = ElementComputeEpilogue(1); + + typename Bias::Arguments bias_arguments{ + tensor_c.device_data(), + ElementComputeEpilogue(0), + {problem_size.n(), cute::_1{}, problem_size.mn().product()} + }; + typename Output::Arguments output_arguments{ + tensor_d.device_data(), + {problem_size.n(), cute::_1{}, problem_size.mn().product()} + }; + typename EVTOutput::Arguments callback_arguments{ + { + {}, // Accum + bias_arguments, // Bias + {} // ApplyBias + }, // EVTApplyBias + output_arguments // Output + }; // EVTOutput + + // Create a tuple of gemm kernel arguments. 
This is later passed as arguments to launch + // instantiated CUTLASS kernel + typename Gemm::Arguments arguments{problem_size, // <- problem size of matrix multiplication + tensor_a.device_ref(), // <- reference to matrix A on device + tensor_b.device_ref(), // <- reference to matrix B on device + tensor_e_reordered.device_ref(), // <- reference to matrix E on device + callback_arguments}; // <- epilogue arguments + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm_op; + + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + + // Launch initialized CUTLASS kernel + status = gemm_op(); + CUTLASS_CHECK(status); + + // uncompress tensor_a based on meta data tensor_e. We need it for reference computing. + cutlass::uncompress(tensor_a_uncompressed.host_ref(), tensor_a.host_ref(), + tensor_e.host_ref(), problem_size.m(), problem_size.k()); + + // Create instantiation for host reference gemm kernel + cutlass::reference::host::Gemm + gemm_host; + + // Launch host reference gemm kernel + gemm_host(problem_size, + alpha, + tensor_a_uncompressed.host_ref(), + tensor_b.host_ref(), + beta, + tensor_c.host_ref(), + tensor_ref_d.host_ref()); + + // Copy output data from CUTLASS host for comparison + tensor_d.sync_host(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::host::TensorEquals( + tensor_d.host_view(), + tensor_ref_d.host_view()); + + std::cout << (passed ? "Passed" : "Failed") << std::endl; + + return (passed ? 0 : -1); +} + +int main() { + + bool notSupported = false; + + // Ampere Sparse Tensor Core operations exposed with mma.sync and ldmatrix are first available + // in CUDA 11.1. + // + // CUTLASS must be compiled with CUDA 11.1 Toolkit to run these examples. + + if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.1 Toolkit or later." << std::endl; + notSupported = true; + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major * 10 + props.minor < 80) { + std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80." + << std::endl; + notSupported = true; + } + + if (notSupported) { + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + return run(); +} diff --git a/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt b/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt index e3afbb9650..cdc3f11ba3 100644 --- a/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt +++ b/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu index 66b0dee500..c0395f5899 100644 --- a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu +++ b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -31,83 +31,181 @@ /** -This example shows how to run convolution kernels using functions and data structures -provided by CUTLASS using tensor cores; which we run on a NVIDIA Ampere GPU. - -Writing a single high performance convolution kernel is hard but do-able. Whereas writing -high performance kernels at scale which works for multiple problem sizes with good abstractions is -really hard. CUTLASS solves this problem by providing simplified abstractions to compose -multiple sections of implicit gemm kernel. When used properly, the kernels can hit peak performance -of GPU easily. - -CUTLASS divides a kernel into hierarchical composable sections. Which means, at each thread, warp -and thread-block level, they compute on their own tile-size with higher level of tile sizes being -composed from lower level ones. Multiple thread-tiles (tile size each thread computes) can be used -to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute -threadblock-tile (tile size computed by a threadblock). - -In thie example, we split variable initialization into -1. Setting up data properties : describes how tensors are laid out in the memory and how the kernel -can view them (logical to physical mapping) -2. Setting up computation properties : describes how the above set tensors will be used to compute -output of convolution. - -First, we setup the data types of the input tensor A, weights' tensor B and output tensor C along -with alpha, beta as the equation for convolution is C = alpha * Conv2dFprop(A, B) + beta * C. In CUTLASS, -the kernels first compute Conv2dFprop(A, B) and leave the rest of the computation to end of the kernel as -alpha * X + beta * C is a simple element-wise operation on X (Conv2dFprop(A, B)) and C. We call this as -epilogue of kernel. Hence, we setup data types for alpha and beta to be equal to -ElementComputeEpilogue = float. We use the data type for elements in input tensor A and B as -cutlass::half_t. We convey this to CUTLASS kernel by initializing template variables ElementAccumulator (float), -ElementComputeEpilogue (float), ElementInputA (cutlass::half_t), ElementInputB (cutlass::half_t), -ElementOutput (float). Communicating just the data type is not enough. As the data is laid out -linearly in memory, we have to convey the layout of tensors. We do that by initializing template -variables LayoutInputA, LayoutInputB and LayoutOutput to TensorNHWC cutlass variable. Next, we setup -rules to comptue alpha * X + beta * C which is called epilogue of the kernel. 
We initialize template -variable EpilogueOp, which takes the data type of output ElementOutput (float), the number of -elements per vector memory access (8), data type of accumulator (float) and data type of -computation of linear combination (alpha * X + beta * C). - -Now that we setup the properties of data, we have to setup properties of computation. - -Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x64, -64x64x64, 16x8x16 (MxNxK) respectively. When passed to instantiate CUTLASS Implicit GEMM kernel, it -internally deduces the amount of threads needed per thread-block, amount of shared memory, storing -data in bank-conflict free manner, and ton of other variables required to compose, intialize and -launch a high performance Implicit GEMM kernel. This is the beauty of CUTLASS, it relieves developer -from understanding and coding complicated hardware optimizations which can easily go wrong. - -CUTLASS also supports multiple MMA pipelines in a threadblock. What are MMA pipelines? MMA pipelines -constitute the whole process of loading input data from global memory to shared memory, loading data -from shared memory to registers, doing matrix multiplication, store to global memory. The below flow -sequence shows a typical mma multistage pipeline. -(see include/cutlass/conv/threadblock/implicit_gemm_multistage.h) - -tensor in global memory --cp_async--> tile in shared memory --smem loads--> registers ---mma--> registers --global stores--> output to global memory - -NVIDIA Ampere uses `cp_async` to build multistage software pipeline to better hide latencies. - - -There are few more template variables initialized such as, which threadblock tile of output matrix -is done which threadblock launched on an SM, CUDA SM architecture of GPU you want to run on. - -These are all put together to create a template variable which describes CUTLASS Implicit GEMM -kernel using cutlass::conv::device::ImplicitGemm template. - -The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. -We use CUTLASS utilities to initialize, fill, compare tensors as they are simple and doesn't come -in the way of learning CUTLASS. - -Once all the tensors are initialized and filled with data, create arguments tuple to launch CUTLASS -kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K = 64, -R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the -important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space -memory required by the kernel we instantiated. If yes, we create it and pass it along with other -arguments created to intialize CUTLASS kernel then, the kernel is launched. - -In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to -compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel. +This example shows how to run CUTLASS's convolution kernels +based on the Implicit GEMM algorithm, that use the Tensor Cores +on an NVIDIA Ampere GPU. + +Writing a single high-performance convolution kernel is hard enough, +let alone writing kernels that perform well for multiple problem sizes +and use good software abstractions. +CUTLASS provides simplified abstractions +to compose multiple sections of a convolution kernel. +When used properly, the kernels can reach peak GPU performance. 
+ +CUTLASS divides a kernel into hierarchical composable sections +for each level of the GPU hardware hierarchy: +thread, warp, and threadblock. +Each section computes on its own tile shape, +with each higher level's tile shape +being composed from lower-level tile shapes. +Multiple thread tiles (the tile shape each thread computes) +can be used to form warp tiles (the tile shape each warp computes), +and multiple warp tiles can be used to compute threadblock tiles +(the tile shape computed by a threadblock). + +In this example, we split variable initialization into two parts. + +1. Setting up data properties: describes how tensors are laid out in the memory + and how the kernel can view them (logical to physical mapping) + +2. Setting up computation properties: describes how the above tensors + will be used to compute the output of convolution + +We begin by setting up the data types +of all the input and output elements of a convolution. +A convolution computes +C = alpha * Conv2dFprop(A, B) + beta * C, +so we set up data types for the input tensor A, +weights tensor B, output tensor C, +and the scaling factors alpha and beta. +CUTLASS divides the convolution into two parts: +the "mainloop" that computes X = Conv2dFprop(A, B), +and the "epilogue" that computes C = alpha * X + beta * C. +The epilogue is an element-wise operation on X and C. +In this case, it is a linear combination, +but other epilogues are possible. + +In this example, we want + +* the scaling factors alpha and beta to be float, + +* the elements of A and B to be cutlass::half_t + (a 16-bit floating-point type), + +* the elements of C to be float, and + +* intermediate sums to be accumulated in float. + +We convey this to the CUTLASS kernel +by setting the following template parameters. + +* alpha and beta: ElementComputeEpilogue = float + +* Elements of input tensor A: ElementInputA = cutlass::half_t + +* Elements of input tensor B: ElementInputB = cutlass::half_t + +* Elements of output tensor C: ElementOutput = float + +* Accumulation type: ElementAccumulator = float + +Next, we describe the layout of the input and output tensors. +We convey this to the CUTLASS kernel +by setting the following template parameters. + +* Layout of input tensor A: LayoutInputA = TensorNHWC + +* Layout of input tensor B: LayoutInputB = TensorNHWC + +* Layout of output tensor C: LayoutOutput = TensorNHWC + +After that, we set up rules to compute the epilogue. +The epilogue in this case is a simple linear combination +C = alpha * X + beta * C. +Thus, we set the kernel's template parameter EpilogueOp +to LinearCombination. LinearCombination itself +has template parameters: + +* the element type of the output tensor (ElementOutput), + +* the number of elements per vector memory access (8), + +* the data type of the accumulator (ElementAccumulator), + +* and the data type used to compute the linear combination + (ElementComputeEpilogue). + +We then define the tile shapes +that each level of the computation uses. +We define these as types that encode the tile shapes +as compile-time integer values. +Each shape expresses the dimensions M x N x K. +Here, the letters refer to the dimensions +of a matrix-matrix multiply. + +* ThreadblockShape defines the threadblock tile shape + as 128 x 128 x 64. + +* WarpShape defines the warp tile shape as 64 x 64 x 64. + +* InstructionShape defines the MMA + (matrix multiply-accumulate) operation shape + as 16 x 8 x 16. 
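(Aside for the reader: the tile shapes listed above are expressed as compile-time types. The following is a minimal sketch, assuming the cutlass::gemm::GemmShape template and the alias names this example defines later in the file; the static_asserts are only illustrative checks that each tile level evenly divides the level above it, not part of the example.)

    #include "cutlass/gemm/gemm.h"   // cutlass::gemm::GemmShape

    // Tile shapes for each level of the hierarchy, expressed as M x N x K.
    using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 64>;
    using WarpShape        = cutlass::gemm::GemmShape<64, 64, 64>;
    using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;

    // A threadblock tile is composed of warp tiles, and a warp tile is composed
    // of MMA instruction tiles, so each level must evenly divide the one above it.
    static_assert(ThreadblockShape::kM % WarpShape::kM == 0 &&
                  ThreadblockShape::kN % WarpShape::kN == 0 &&
                  ThreadblockShape::kK % WarpShape::kK == 0,
                  "Warp tile must evenly divide the threadblock tile");
    static_assert(WarpShape::kM % InstructionShape::kM == 0 &&
                  WarpShape::kN % InstructionShape::kN == 0 &&
                  WarpShape::kK % InstructionShape::kK == 0,
                  "MMA instruction tile must evenly divide the warp tile");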
+ +These types become template arguments +of the kernel properties type +cutlass::conv::kernel::DefaultConv2dFprop. +The kernel uses these shapes to deduce +the number of threads needed per threadblock, +the required amount of shared memory, +the internal layouts needed to access +shared memory without bank conflicts, +and many other properties that the kernel needs +for good performance. +CUTLASS deduces all these properties automatically, +so that users don't have to. +DefaultConv2dFprop accepts other template parameters +that describe things like the target CUDA SM architecture. + +CUTLASS also supports multiple MMA pipelines in a threadblock. +An MMA pipeline constitutes the whole process +of loading input data from global memory to shared memory, +loading data from shared memory to registers, +doing matrix multiplication, +and storing the result to global memory. +The below flow sequence shows a typical MMA multistage pipeline +(see include/cutlass/conv/threadblock/implicit_gemm_multistage.h). + +tensor in global memory +--cp_async--> +tile in shared memory +--smem loads--> +registers +--mma--> +registers +--global stores--> +output to global memory + +On NVIDIA Ampere, the kernel uses `cp_async` +to build a multistage software pipeline. +This helps it better hide latency. + +At this point, we can define the actual CUTLASS kernel type +as the alias ImplicitGemm, a specialization of +cutlass::conv::device::ImplicitGemmConvolution. +The latter accepts the kernel properties type alias +Conv2dFpropKernel as its one template argument. + +This example then sets up a test problem +and arguments to the kernel. +We use CUTLASS utilities to allocate +the input and output tensors +and fill them with sample input data. +We then create the kernel arguments +as an instance of ImplicitGemm::Arguments. +The arguments include +the problem size (N = 1, H = 64, W = 64, C = 128), +filter size (K = 64, R = 3, S = 3, C = 128), +padding, strides, dilation, tensors, alpha, beta, +and the split k-dimension factor. +We also query CUTLASS if the kernel we instantiated +requires any memory for scratch space. +If yes, we reserve scratch space and pass it along +with other arguments to initialize the CUTLASS kernel. + +After launching the CUTLASS kernel, this example runs +a reference convolution kernel (from CUTLASS utilities) +to check correctness.
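(Aside for the reader: a minimal sketch of the host-side flow just described, assuming the ImplicitGemm alias, the tensor_a/b/c/d host tensors, problem_size, options, and the CUTLASS_CHECK helper that appear later in this file. It illustrates the sequence of calls; it is not the example's exact code.)

    // Build the argument structure described above.
    typename ImplicitGemm::Arguments arguments{
      problem_size,
      tensor_a.device_ref(),
      tensor_b.device_ref(),
      tensor_c.device_ref(),
      tensor_d.device_ref(),
      {options.alpha, options.beta}
    };

    ImplicitGemm implicit_gemm_op;

    // Query how much scratch space the kernel needs and allocate it.
    size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

    // Verify the problem is supported, initialize the kernel, then launch it.
    CUTLASS_CHECK(implicit_gemm_op.can_implement(arguments));
    CUTLASS_CHECK(implicit_gemm_op.initialize(arguments, workspace.get()));
    CUTLASS_CHECK(implicit_gemm_op());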
*/ #include @@ -131,8 +229,8 @@ compare if the output from CUTLASS kernel is same as the reference implicit GEMM #include "helper.h" -// The code section below describes datatype for input, output tensors and computation between -// elements +// Data types for input and output tensors +// and computation between elements using ElementAccumulator = float; // Data type of accumulator using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) using ElementInputA = cutlass::half_t; // Data type of elements in input tensor @@ -143,39 +241,44 @@ using LayoutInputA = cutlass::layout::TensorNHWC; using LayoutInputB = cutlass::layout::TensorNHWC; using LayoutOutput = cutlass::layout::TensorNHWC; -// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +// Whether to use tensor cores or regular SIMT cores on GPU SM using MMAOp = cutlass::arch::OpClassTensorOp; -// This code section describes CUDA SM architecture number +// SM architecture number using SmArch = cutlass::arch::Sm80; -// This code section describes the tile size a thread block will compute -using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 64>; // Threadblock tile shape +// Threadblock tile shape +using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 64>; -// This code section describes tile size a warp will compute -using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; // Warp tile shape +// Warp tile shape +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; -// This code section describes the size of MMA op -using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; // TensorCore instruction shape +// MMA (Tensor Core instruction, in this case) tile shape +using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; -// This code section describes how threadblocks are scheduled on GPU +// How the kernel schedules threadblocks using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; -// Number of pipelines you want to use +// Number of pipeline stages to use constexpr int NumStages = 3; -// This code section describe iterator algorithm selected is Analytic or Optimized +// Which iterator algorithm to use: Analytic or Optimized static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized; -// This code section describes the epilogue part of the kernel, we use default value +// Is the output packed or strided +// Use kStride if using strided output +static cutlass::conv::StrideSupport const OutputStride = cutlass::conv::StrideSupport::kUnity; + +// The epilogue part of the kernel using EpilogueOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, // Data type of output matrix. - 128 / cutlass::sizeof_bits::value, // The number of elements per vectorized. + 128 / cutlass::sizeof_bits::value, // The number of elements per vectorized // memory access. This becomes the vector width of // math instructions in the epilogue too. 
ElementAccumulator, // Data type of accumulator ElementComputeEpilogue>; // Data type for alpha/beta in linear combination +// Kernel properties type using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< ElementInputA, LayoutInputA, ElementInputB, LayoutInputB, @@ -190,9 +293,11 @@ using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< SwizzleThreadBlock, NumStages, cutlass::arch::OpMultiplyAdd, - IteratorAlgorithm + IteratorAlgorithm, + OutputStride >::Kernel; +// Type of the actual kernel using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -230,7 +335,7 @@ struct Options { beta(0), benchmark(false) { } - // Verify the problem size is compatible with the CUTLASS Convolution implementation. + // Verify that the problem size is compatible with CUTLASS's convolution implementation bool valid() { // @@ -256,7 +361,7 @@ struct Options { return true; } - /// Updates input and filter sizes + /// Update input and filter sizes void update( cutlass::Tensor4DCoord input_size, cutlass::Tensor4DCoord filter_size) { @@ -270,7 +375,7 @@ struct Options { padding.c() = filter_size.w() / 2; } - // Parses the command line + // Parse command-line arguments void parse(int argc, char const **args) { cutlass::CommandLine cmd(argc, args); @@ -302,11 +407,11 @@ struct Options { cmd.get_cmd_line_argument("k", filter_size.n()); cmd.get_cmd_line_argument("r", filter_size.h()); cmd.get_cmd_line_argument("s", filter_size.w()); - filter_size.c() = input_size.c(); + filter_size.c() = input_size.c(); cmd.get_cmd_line_argument("alpha", alpha); cmd.get_cmd_line_argument("beta", beta); - + cmd.get_cmd_line_argument("iterations", iterations); cmd.get_cmd_line_argument("tag", tag); @@ -320,12 +425,12 @@ struct Options { } } - /// Prints the usage statement. + /// Print an explanation of the command-line arguments std::ostream & print_usage(std::ostream &out) const { out << "16_ampere_tensorop_conv2dfprop example\n\n" - << " This example uses Ampere's Tensor Core operators on F16 data types to compute\n" - << " forward convolution on tensors of layout NHWC.\n\n" + << " This example uses Ampere's Tensor Core operators on F16 data types\n" + << " to compute forward convolution on tensors of layout NHWC.\n\n" << "Options:\n\n" << " --help If specified, displays this usage statement.\n\n" << " --n= Input tensor extent N\n" @@ -350,7 +455,7 @@ struct Options { return out; } - + /// Computes the output tensor size (NPQK) cutlass::Tensor4DCoord output_size() const { return cutlass::Tensor4DCoord( @@ -360,19 +465,20 @@ struct Options { filter_size.n()); } - /// Compute performance in GFLOP/s + /// Compute performance in Gflop/s + /// + /// Gflop/s stands for billions (10^9) of + /// floating-point operations per second (Gflop/s). 
double gflops(double runtime_s) const { // Number of multiply-adds = NPQK * CRS int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()); - + // Two flops per multiply-add return 2.0 * double(fmas) / double(1.0e9) / runtime_s; } }; -///////////////////////////////////////////////////////////////////////////////////////////////// - struct Result { double runtime_ms; double gflops; @@ -380,14 +486,14 @@ struct Result { cutlass::Status reference_check; cudaError_t error; - Result(): - runtime_ms(0), + Result(): + runtime_ms(0), gflops(0), status(cutlass::Status::kSuccess), reference_check(cutlass::Status::kInvalid), error(cudaSuccess) { } - static std::ostream & print_header(std::ostream &out, Options const &options) { + static std::ostream& print_header(std::ostream &out, Options const &options) { if (!options.tag.empty()) { out << "Name,"; @@ -404,7 +510,7 @@ struct Result { out << options.tag << ","; } - out + out << "conv_" << idx << "," << options.input_size.n() << "," << options.input_size.h() << "," @@ -420,8 +526,6 @@ struct Result { } }; -///////////////////////////////////////////////////////////////////////////////////////////////// - /// Runs one benchmark Result profile_convolution(Options const &options) { @@ -441,7 +545,7 @@ Result profile_convolution(Options const &options) { // Initialize tensors // - // Fill tensor A on host with uniform-distribution random data + // Fill tensor A on host with uniformly distributed random data cutlass::reference::host::TensorFillRandomUniform( tensor_a.host_view(), 1, @@ -449,7 +553,7 @@ Result profile_convolution(Options const &options) { ElementInputA(-8), 0); - // Fill tensor B on host with uniform-distribution random data + // Fill tensor B on host with uniformly distributed random data cutlass::reference::host::TensorFillRandomUniform( tensor_b.host_view(), 1, @@ -457,9 +561,13 @@ Result profile_convolution(Options const &options) { ElementInputB(-8), 0); - // Fill tensor C on host with zeros - cutlass::reference::host::TensorFill( - tensor_c.host_view()); + // Fill tensor C on host with uniformly distributed random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c.host_view(), + 1, + ElementOutput(7), + ElementOutput(-8), + 0); // Fill tensor D on host with zeros cutlass::reference::host::TensorFill( @@ -486,7 +594,7 @@ Result profile_convolution(Options const &options) { int split_k_slices = 1; // Construct Conv2dProblemSize with user defined output size - cutlass::conv::Conv2dProblemSize problem_size( + cutlass::conv::Conv2dProblemSize problem_size( options.input_size, options.filter_size, options.padding, @@ -497,7 +605,7 @@ Result profile_convolution(Options const &options) { split_k_slices ); - // Construct ImplicitGemm::Argument structure with conv2d + // Construct ImplicitGemm::Argument structure with conv2d // problem size, data pointers, and epilogue values typename ImplicitGemm::Arguments arguments{ problem_size, @@ -535,7 +643,7 @@ Result profile_convolution(Options const &options) { // // Optional reference check // - + if (options.reference_check) { std::cout << "Verification on host...\n"; @@ -548,8 +656,7 @@ Result profile_convolution(Options const &options) { ElementOutput, LayoutOutput, ElementComputeEpilogue, - ElementAccumulator, - cutlass::NumericConverter + ElementAccumulator >( problem_size, tensor_a.host_ref(), @@ -560,7 +667,7 @@ Result profile_convolution(Options const &options) { options.beta ); - // Check if output from CUTLASS kernel and 
reference kernel are equal or not + // Check if CUTLASS kernel and reference kernel produced the same output tensor_d.sync_host(); bool passed = cutlass::reference::host::TensorEquals( @@ -585,14 +692,14 @@ Result profile_convolution(Options const &options) { std::stringstream ss; ss << "16_ampere_workspace_conv2dfprop_" - << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() + << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() << "_" - << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() + << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() << ".dat"; std::ofstream output_workspace(ss.str()); - output_workspace + output_workspace << "Input = \n" << tensor_a.host_view() << "\n\n" << "Filters = \n" << tensor_b.host_view() << "\n\n"; @@ -612,7 +719,7 @@ Result profile_convolution(Options const &options) { if (options.measure_performance) { cudaEvent_t events[2]; - + for (auto & event : events) { result.error = cudaEventCreate(&event); if (result.error != cudaSuccess) { @@ -628,7 +735,7 @@ Result profile_convolution(Options const &options) { return result; } - // Launch a sequence of implicit GEMM operations on the device + // Launch a sequence of implicit GEMM operations on the device. for (int iteration = 0; iteration < options.iterations; ++iteration) { result.status = implicit_gemm_op(); CUTLASS_CHECK(result.status); @@ -648,7 +755,7 @@ Result profile_convolution(Options const &options) { return result; } - // Measure elapsed runtime + // Measure elapsed runtime. float runtime_ms = 0; result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); if (result.error != cudaSuccess) { @@ -656,7 +763,7 @@ Result profile_convolution(Options const &options) { return result; } - // Print average runtime and GFLOPs. + // Print average run time and floating-point throughput (Gflop/s). result.runtime_ms = double(runtime_ms) / double(options.iterations); result.gflops = options.gflops(result.runtime_ms / 1000.0); @@ -669,8 +776,6 @@ Result profile_convolution(Options const &options) { return result; } -///////////////////////////////////////////////////////////////////////////////////////////////// - int main(int argc, char const **args) { bool notSupported = false; @@ -686,7 +791,7 @@ int main(int argc, char const **args) { cudaDeviceProp props; CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); - if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + if (!(props.major >= 8)) { std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." << std::endl; notSupported = true; @@ -697,7 +802,7 @@ int main(int argc, char const **args) { } Options options; - + options.parse(argc, args); if (options.help) { @@ -764,5 +869,3 @@ int main(int argc, char const **args) { return 0; } - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/17_fprop_per_channel_bias/CMakeLists.txt b/examples/17_fprop_per_channel_bias/CMakeLists.txt index 5ca41a419c..350a27998a 100644 --- a/examples/17_fprop_per_channel_bias/CMakeLists.txt +++ b/examples/17_fprop_per_channel_bias/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu b/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu index 2b6b25c73c..f1658c0fe6 100644 --- a/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu +++ b/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -290,7 +290,7 @@ int main(int argc, char const **args) { cudaDeviceProp props; CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); - if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + if (!(props.major >= 8)) { std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." << std::endl; notSupported = true; diff --git a/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt b/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt index 21a21e90de..5f4541c3fb 100644 --- a/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt +++ b/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu index 62450e2138..1595dd6088 100644 --- a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu +++ b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -326,7 +326,7 @@ int main(int argc, char const **args) { cudaDeviceProp props; CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); - if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + if (!(props.major >= 8)) { std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." << std::endl; notSupported = true; diff --git a/examples/19_tensorop_canonical/CMakeLists.txt b/examples/19_tensorop_canonical/CMakeLists.txt index c48ff26008..140f51bf92 100644 --- a/examples/19_tensorop_canonical/CMakeLists.txt +++ b/examples/19_tensorop_canonical/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/19_tensorop_canonical/tensorop_canonical.cu b/examples/19_tensorop_canonical/tensorop_canonical.cu index 15ad17f003..1f0aa93282 100644 --- a/examples/19_tensorop_canonical/tensorop_canonical.cu +++ b/examples/19_tensorop_canonical/tensorop_canonical.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/20_simt_canonical/CMakeLists.txt b/examples/20_simt_canonical/CMakeLists.txt index 25f2969f3c..36dcda7af6 100644 --- a/examples/20_simt_canonical/CMakeLists.txt +++ b/examples/20_simt_canonical/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/20_simt_canonical/simt_canonical.cu b/examples/20_simt_canonical/simt_canonical.cu index d905d4da60..8f2fbc4de0 100644 --- a/examples/20_simt_canonical/simt_canonical.cu +++ b/examples/20_simt_canonical/simt_canonical.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/21_quaternion_gemm/CMakeLists.txt b/examples/21_quaternion_gemm/CMakeLists.txt index 14d71c0796..742a9e7868 100644 --- a/examples/21_quaternion_gemm/CMakeLists.txt +++ b/examples/21_quaternion_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/21_quaternion_gemm/quaternion_gemm.cu b/examples/21_quaternion_gemm/quaternion_gemm.cu index 95c7b4d7a2..025a9c974e 100644 --- a/examples/21_quaternion_gemm/quaternion_gemm.cu +++ b/examples/21_quaternion_gemm/quaternion_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/examples/22_quaternion_conv/CMakeLists.txt b/examples/22_quaternion_conv/CMakeLists.txt index 9bfad4ff98..52e1727977 100644 --- a/examples/22_quaternion_conv/CMakeLists.txt +++ b/examples/22_quaternion_conv/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/22_quaternion_conv/quaternion_conv.cu b/examples/22_quaternion_conv/quaternion_conv.cu index 756d465124..bc7173d1ac 100644 --- a/examples/22_quaternion_conv/quaternion_conv.cu +++ b/examples/22_quaternion_conv/quaternion_conv.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -470,8 +470,7 @@ Result profile_convolution(Options const &options) { ElementOutput, LayoutOutput, ElementComputeEpilogue, - ElementAccumulator, - cutlass::NumericConverter + ElementAccumulator >( problem_size, tensor_a.host_ref(), diff --git a/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt b/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt index 49d313f4dc..e5b4ec0351 100644 --- a/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt +++ b/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -27,10 +27,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - +set(TEST_STANDARD --m=1024 --n=1024 --k=1024) +set(TEST_LARGE_PERFCHECK --m=4096 --n=3456 --k=4096 --perf-check) cutlass_example_add_executable( 23_ampere_gemm_operand_reduction_fusion ampere_gemm_operand_reduction_fusion.cu + TEST_COMMAND_OPTIONS + TEST_STANDARD + TEST_LARGE_PERFCHECK ) diff --git a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu index 41ea3200a1..4e5fca1a03 100644 --- a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu +++ b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -30,13 +30,13 @@ **************************************************************************************************/ /** -The example demenstrates how to reduce one of the operands of the GEMM along the k-dimension when +The example demonstrates how to reduce one of the operands of the GEMM along the k-dimension when computing GEMM. So the output also contains either a Mx1 or 1XN vector. It only works with Ampere -HMMA 16x8x16 FP16 tensor cores, though it is not difficult to apply to other Turing/Ampere tensor +16x8x16 FP16/BF16 tensor cores, though it is not difficult to apply to other Turing/Ampere tensor core instructions. 
Most of the reduction is done in gemm/warp level, see gemm/warp/mma_with_reduction_tensor_op.h -A few bit of reduction is done in the epilouge before storing the vector, see +A small amount of the reduction is done in the epilogue before storing the vector, see epilogue/threadblock/epilogue_gemm_k_reduction.h */ @@ -45,7 +45,7 @@ epilogue/threadblock/epilogue_gemm_k_reduction.h #include #include "cutlass/cutlass.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/device/gemm_with_k_reduction.h" #include "cutlass/gemm/kernel/default_gemm_with_k_reduction.h" #include "cutlass/reduction/device/reduce_split_k.h" #include "cutlass/reduction/kernel/reduce_split_k.h" @@ -67,9 +67,9 @@ epilogue/threadblock/epilogue_gemm_k_reduction.h // elements using ElementAccumulator = float; // Data type of accumulator using ElementComputeEpilogue = ElementAccumulator; // Data type of epilogue computation -using ElementInputA = cutlass::half_t; // Data type of elements in input tensor -using ElementInputB = cutlass::half_t; // Data type of elements in input tensor -using ElementOutput = cutlass::half_t; // Data type of elements in output tensor +using ElementInputA = cutlass::bfloat16_t; // Data type of elements in input tensor +using ElementInputB = cutlass::bfloat16_t; // Data type of elements in input tensor +using ElementOutput = cutlass::bfloat16_t; // Data type of elements in output tensor using LayoutInputA = cutlass::layout::ColumnMajor; using LayoutInputB = cutlass::layout::RowMajor; @@ -101,6 +101,12 @@ constexpr int NumStages = 4; // Reduce A or B operand along the K dimension constexpr bool ReduceKForA = true; +// Alignment of A operand +constexpr int AlignmentA = 8; + +// Alignment of B operand +constexpr int AlignmentB = 8; + // This code section describes the epilogue part of the kernel, we use default value using EpilogueOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, // Data type of output matrix.
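(Aside for the reader, tied to the fusion described at the top of this file's changes: with ReduceKForA == true the kernel produces, in addition to D = alpha * A * B + beta * C, an M x 1 vector holding the sum of operand A over the K dimension. A minimal host-side sketch follows, with placeholder names and a row-major A assumed for simplicity rather than this example's ColumnMajor layout.)

    // reduction[m] = sum over k of A(m, k): an M x 1 vector produced alongside D.
    void reduce_operand_a_along_k(float const *A, float *reduction, int M, int K) {
      for (int m = 0; m < M; ++m) {
        float sum = 0.0f;
        for (int k = 0; k < K; ++k) {
          sum += A[m * K + k];   // accumulate operand A along the K dimension
        }
        reduction[m] = sum;
      }
    }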
@@ -110,9 +116,9 @@ using EpilogueOp = cutlass::epilogue::thread::LinearCombination< ElementAccumulator, // Data type of accumulator ElementComputeEpilogue>; -using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmWithKReduction< - ElementInputA, LayoutInputA, cutlass::ComplexTransform::kNone, 8, - ElementInputB, LayoutInputB, cutlass::ComplexTransform::kNone, 8, +using Gemm = typename cutlass::gemm::device::GemmWithKReduction< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, ElementOutput, LayoutOutput, ElementAccumulator, MMAOp, @@ -124,13 +130,15 @@ using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmWithKReduction< EpilogueOp, SwizzleThreadBlock, NumStages, - cutlass::arch::OpMultiplyAdd ->::GemmKernel; - -using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + AlignmentA, + AlignmentB, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone +>; // Below is the reduction kernel used in the case of parallel split-k -using ReduceGemmSplitKShape = cutlass::MatrixShape<4, 64>;; +using ReduceGemmSplitKShape = cutlass::MatrixShape<4, 64>; using ReduceOp = cutlass::reduction::thread::ReduceAdd< ElementAccumulator, @@ -146,7 +154,7 @@ using ReduceGemmSplitKKernel = cutlass::reduction::kernel::ReduceSplitK< using ReduceGemmSplitK = cutlass::reduction::device::ReduceSplitK; -using ReduceVectorSplitKShape = cutlass::MatrixShape<1, 256>;; +using ReduceVectorSplitKShape = cutlass::MatrixShape<1, 256>; // This code section describes the epilogue part of the kernel, we use default value using DummyEpilogueOp = cutlass::epilogue::thread::LinearCombination< @@ -271,7 +279,7 @@ struct Options { /// Prints the usage statement. std::ostream & print_usage(std::ostream &out) const { - out << "28_ampere_gemm_bias_fusion example\n\n" + out << "23_ampere_operand_gemm_reduction_fusion\n\n" << "Options:\n\n" << " --help If specified, displays this usage statement.\n\n" << " --m= GEMM M\n" @@ -289,7 +297,7 @@ struct Options { << " --tag= String to replicate across the first column in the results table\n"; out << "\n\nExamples:\n\n" - << "$ ./examples/23_ampere_gemm_bias_fusion_example/ampere_gemm_bias_fusion --m=1024 --n=1024 --k=1024 \n\n"; + << "$ ./examples/23_ampere_gemm_operand_reduction_fusion/23_ampere_gemm_operand_reduction_fusion --m=1024 --n=1024 --k=1024 \n\n"; return out; } @@ -368,23 +376,23 @@ Result profile(Options const &options) { // Fill input and output matrices on host using CUTLASS helper functions cutlass::reference::host::TensorFillRandomUniform( tensor_a.host_view(), - 1, - ElementInputA(4), - ElementInputA(-4), + 1997, + ElementInputA(1), + ElementInputA(-1), 0); // <- Fill tensor A on host with uniform-distribution random data cutlass::reference::host::TensorFillRandomUniform( tensor_b.host_view(), - 1, - ElementInputB(4), - ElementInputB(-4), + 2003, + ElementInputB(1), + ElementInputB(-1), 0); // <- Fill tensor B on host with uniform-distribution random data cutlass::reference::host::TensorFillRandomUniform( tensor_c.host_view(), - 1, - ElementOutput(4), - ElementOutput(-4), + 2017, + ElementOutput(1), + ElementOutput(-1), 0); // <- Fill matrix C on host with uniform-distribution random data cutlass::reference::host::TensorFill( tensor_d.host_view()); // <- fill matrix D on host with zeros @@ -418,7 +426,7 @@ Result profile(Options const &options) { // Create a tuple of gemm kernel arguments. 
This is later passed as arguments to launch // instantiated CUTLASS kernel - typename Gemm::Arguments arguments{ + typename Gemm::Arguments arguments( mode, options.problem_size, batch_count, @@ -437,8 +445,7 @@ Result profile(Options const &options) { tensor_b.layout().stride(0), tensor_c.layout().stride(0), tensor_d.layout().stride(0), - tensor_reduction.layout().stride(0) - }; + tensor_reduction.layout().stride(0)); // Instantiate CUTLASS kernel depending on templates Gemm gemm_op; @@ -507,15 +514,14 @@ Result profile(Options const &options) { cutlass::TensorRef tensor_nullptr_tensorref(nullptr, splitk_vector_layout); - typename ReduceVectorSplitK::Arguments reduce_vector_splitk_arguments{ + typename ReduceVectorSplitK::Arguments reduce_vector_splitk_arguments( cutlass::MatrixCoord(1, reduce_vector_length), batch_count, size_t(reduce_vector_length), workspace_vector_tensorref, tensor_reduction_tensorref, tensor_nullptr_tensorref, - {1.0f, 0.0f} - }; + {1.0f, 0.0f}); ReduceVectorSplitK reduce_vector_splitk_op; @@ -561,7 +567,7 @@ Result profile(Options const &options) { tensor_reduction.sync_host(); - // Compute bias + relu in host code + // Reduce K in host code if (ReduceKForA) { for (int m = 0; m < options.problem_size.m(); ++m) { for (int k = 0; k < options.problem_size.k(); ++k) { @@ -581,7 +587,7 @@ Result profile(Options const &options) { // Check if output from CUTLASS kernel and reference kernel are equal or not bool pass = cutlass::reference::host::TensorEquals(tensor_d.host_view(), tensor_ref_d.host_view()); - + pass &= cutlass::reference::host::TensorEquals(tensor_ref_reduction.host_view(), tensor_reduction.host_view()); @@ -612,10 +618,10 @@ Result profile(Options const &options) { if (options.reference_check) { output_workspace << "Reference D = \n" << tensor_ref_d.host_view() << "\n\n"; - output_workspace << "Reference reduction vector= \n" << tensor_ref_reduction.host_view() << "\n\n"; + output_workspace << "Reference reduction vector = \n" << tensor_ref_reduction.host_view() << "\n\n"; } - output_workspace << "Computed = \n" << tensor_d.host_view() << std::endl; + output_workspace << "Computed D = \n" << tensor_d.host_view() << std::endl; output_workspace << "Computed reduction vector = \n" << tensor_reduction.host_view() << std::endl; std::cout << "Results written to '" << ss.str() << "'." << std::endl; @@ -699,7 +705,7 @@ int main(int argc, char const **args) { cudaDeviceProp props; CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); - if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + if (!(props.major >= 8)) { std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." << std::endl; notSupported = true; diff --git a/examples/24_gemm_grouped/CMakeLists.txt b/examples/24_gemm_grouped/CMakeLists.txt index c9f3558e4c..32614a075d 100644 --- a/examples/24_gemm_grouped/CMakeLists.txt +++ b/examples/24_gemm_grouped/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -31,6 +31,7 @@ cutlass_example_add_executable( 24_gemm_grouped - gemm_grouped.cu + gemm_grouped.cu ) + diff --git a/examples/24_gemm_grouped/gemm_grouped.cu b/examples/24_gemm_grouped/gemm_grouped.cu index a32c80d755..993d554f64 100644 --- a/examples/24_gemm_grouped/gemm_grouped.cu +++ b/examples/24_gemm_grouped/gemm_grouped.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -37,7 +37,7 @@ leading dimensions and problem sizes are stored in arrays in GMEM. This differs from "Batched Array" GEMM because the size of each GEMM problem in the Grouped GEMM - concept may be distinct. + concept may be distinct. This benchmark program initializes a workspace with random problem sizes for a given number of groups. Command line options enable overriding M, N, and/or K dimensions with uniform values to @@ -66,6 +66,7 @@ ///////////////////////////////////////////////////////////////////////////////////////////////// +#include #include #include #include @@ -98,6 +99,7 @@ struct Result { double runtime_ms; + double initialization_time_ms; double gflops; cutlass::Status status; cudaError_t error; @@ -109,11 +111,13 @@ struct Result { Result( double runtime_ms = 0, + double initialization_time_ms = 0, double gflops = 0, cutlass::Status status = cutlass::Status::kSuccess, cudaError_t error = cudaSuccess ): - runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } + runtime_ms(runtime_ms), initialization_time_ms(initialization_time_ms), gflops(gflops), + status(status), error(error), passed(true) { } }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -134,6 +138,8 @@ struct Options { bool help; bool error; bool reference_check; + bool profile_initialization; + bool sort_problems; std::vector problem_sizes; @@ -155,21 +161,47 @@ struct Options { std::string output_tag; std::ofstream output_file; + using GroupScheduleMode = cutlass::gemm::kernel::GroupScheduleMode; + std::vector scheduler_modes; + + std::unordered_map + str_to_scheduler_mode = { + {"kDeviceOnly", GroupScheduleMode::kDeviceOnly}, + {"kHostPrecompute", GroupScheduleMode::kHostPrecompute} + }; + + struct GroupScheduleModeHash { + size_t operator()(GroupScheduleMode m) const { + return static_cast(m); + } + }; + + std::unordered_map + scheduler_mode_to_str = { + {GroupScheduleMode::kDeviceOnly, "kDeviceOnly"}, + {GroupScheduleMode::kHostPrecompute, "kHostPrecompute"} + }; + + std::vector all_scheduler_modes = {GroupScheduleMode::kDeviceOnly, GroupScheduleMode::kHostPrecompute}; + // // Methods - // + // Options(): help(false), error(false), alignment(8), reference_check(true), + profile_initialization(false), + sort_problems(false), problem_count(15), iterations(20), cuda_streams(0), verbose(false), alpha(1), - beta() + beta(), + scheduler_modes({GroupScheduleMode::kDeviceOnly}) { } // Parses the command line @@ -184,13 +216,40 @@ struct Options { cmd.get_cmd_line_argument("alignment", alignment, 8); cmd.get_cmd_line_argument("groups", problem_count, 15); cmd.get_cmd_line_argument("alpha", alpha, 1.0f); - 
cmd.get_cmd_line_argument("beta", beta, 0.0f); + cmd.get_cmd_line_argument("beta", beta, 0.0f); cmd.get_cmd_line_argument("iterations", iterations, 20); cmd.get_cmd_line_argument("streams", cuda_streams, 0); cmd.get_cmd_line_argument("verbose", verbose, false); cmd.get_cmd_line_argument("reference-check", reference_check, true); + cmd.get_cmd_line_argument("profile-initialization", profile_initialization, false); + cmd.get_cmd_line_argument("sort-problems", sort_problems, false); cmd.get_cmd_line_argument("benchmark", benchmark_path); + std::vector scheduler_mode_strs; + cmd.get_cmd_line_arguments("scheduler-modes", scheduler_mode_strs); + + if (!scheduler_mode_strs.empty()) { + scheduler_modes.clear(); + if (scheduler_mode_strs.size() == 1 && scheduler_mode_strs[0] == "all") { + scheduler_modes = all_scheduler_modes; + } else { + for (std::string precomp_str : scheduler_mode_strs) { + auto it = str_to_scheduler_mode.find(precomp_str); + if (it != str_to_scheduler_mode.end()) { + scheduler_modes.push_back(it->second); + } else if (precomp_str == "all") { + std::cerr << "Flag --scheduler-modes=all must not contain other scheduler modes in list." << std::endl; + error = true; + return; + } else { + std::cerr << "Unrecognized scheduler mode '" << precomp_str << "'" << std::endl; + error = true; + return; + } + } + } + } + std::string output_path; cmd.get_cmd_line_argument("tag", output_tag); cmd.get_cmd_line_argument("output_file", output_path); @@ -314,6 +373,8 @@ struct Options { /// Post processes the problems void bin_problems() { + problem_bins.clear(); + problem_count = int(problem_sizes.size()); // @@ -340,19 +401,22 @@ struct Options { << " 'group' may compute a unique problem size. Problem sizes and pointers to matrices are both stored\n" << " in device Global Memory and loaded by the kernel.\n\n" << "Options:\n\n" - << " --help If specified, displays this usage statement.\n\n" - << " --benchmark= Executes a benchmark problem size.\n" - << " --output_file= Path to a CSV file to output results. If it exists already, results are appended.\n" - << " --tag= String tag to prepend to the CSV file.\n" - << " --groups= Number of individual GEMM problems (default: --groups=15)\n" - << " --m= Sets the M dimension for all groups. Otherwise, it is selected randomly\n" - << " --n= Sets the N dimension for all groups. Otherwise, it is selected randomly\n" - << " --k= Sets the K dimension for all groups. Otherwise, it is selected randomly\n" - << " --alpha= Epilogue scalar alpha (real part)\n" - << " --beta= Epilogue scalar beta (real part)\n\n" - << " --iterations= Number of profiling iterations to perform.\n" - << " --reference-check= If true, performs reference check.\n" - << " --verbose= If true, prints problem sizes and batching structure.\n"; + << " --help If specified, displays this usage statement.\n\n" + << " --benchmark= Executes a benchmark problem size.\n" + << " --output_file= Path to a CSV file to output results. If it exists already, results are appended.\n" + << " --tag= String tag to prepend to the CSV file.\n" + << " --groups= Number of individual GEMM problems (default: --groups=15)\n" + << " --m= Sets the M dimension for all groups. Otherwise, it is selected randomly\n" + << " --n= Sets the N dimension for all groups. Otherwise, it is selected randomly\n" + << " --k= Sets the K dimension for all groups. 
Otherwise, it is selected randomly\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --beta= Epilogue scalar beta (real part)\n" + << " --scheduler-modes= List of scheduler modes to be profile for grouped GEMM scheduler (default: --scheduler_modes=kDeviceOnly)\n" + << " --iterations= Number of profiling iterations to perform.\n" + << " --reference-check= If true, performs reference check.\n" + << " --verbose= If true, prints problem sizes and batching structure.\n" + << " --profile-initialization= If true, profiles the device-level kernel's initialization.\n" + << " --sort-problems= If true, sorts problem sizes in descending order of GEMM-K dimension.\n"; out << "\n\nExamples:\n\n" @@ -365,6 +429,12 @@ struct Options { << "# Runs a grouped GEMM that is equivalent to a batched GEMM\n" << "$ ./examples/24_gemm_grouped/24_gemm_grouped --groups=100 --m=2048 --n=1024 --k=1024 --verbose=true\n\n" + << "# Runs a grouped GEMM with each different scheduler mode\n" + << "$ ./examples/24_gemm_grouped/24_gemm_grouped --scheduler-modes=all\n\n" + + << "# Runs a grouped GEMM with each different scheduler mode and profiles host-side initialization time\n" + << "$ ./examples/24_gemm_grouped/24_gemm_grouped --scheduler-modes=all --profile-initialization=true\n\n" + << "# Runs a grouped GEMM problem given an externally supplied benchmark file. This is a text file in which\n" << "# Each line contains a unique group index and an MxNxK triple indicating problemsize.\n" << "#\n" @@ -385,13 +455,13 @@ struct Options { /// Compute performance in GFLOP/s double gflops(double runtime_s) const { - // Number of real-valued multiply-adds + // Number of real-valued multiply-adds int64_t fmas = int64_t(); for (auto const & problem : problem_sizes) { fmas += problem.product(); } - + // Two flops per multiply-add return 2.0 * double(fmas) / double(1.0e9) / runtime_s; } @@ -399,10 +469,9 @@ struct Options { /////////////////////////////////////////////////////////////////////////////////////////////////// -template -class TestbedGrouped { +template +class BaseTestbed { public: - // // Type definitions // @@ -421,8 +490,6 @@ public: using MatrixCoord = typename LayoutC::TensorCoord; -private: - // // Data members // @@ -462,13 +529,7 @@ private: cutlass::DeviceAllocation ptr_C; cutlass::DeviceAllocation ptr_D; -public: - - // - // Methods - // - - TestbedGrouped( + BaseTestbed( Options &options_, cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, @@ -481,13 +542,11 @@ public: return options.problem_count; } -private: - /// Helper to initialize a tensor view template - void initialize_tensor_( + void initialize_tensor( Element *ptr, - size_t capacity, + size_t capacity, cutlass::Distribution::Kind dist_kind, uint32_t seed) { @@ -519,7 +578,7 @@ private: cutlass::reference::device::BlockFillRandomUniform( ptr, capacity, seed, scope_max, scope_min, 0); - } + } else if (dist_kind == cutlass::Distribution::Gaussian) { cutlass::reference::device::BlockFillRandomGaussian( @@ -530,7 +589,7 @@ private: // Fill with increasing elements cutlass::reference::device::BlockFillSequential( ptr, capacity, Element(1), Element()); - } + } else { // Fill with all 1s @@ -539,65 +598,13 @@ private: } } - /// Verbose printing of problem sizes - void print_problem_sizes_() { - - // Print groups - std::cout << problem_count() << " groups:\n"; - - int32_t idx = 0; - int64_t total_tiles = 0; - - for (auto const & problem : options.problem_sizes) { - 
- int tiles = - ((problem.m() + Gemm::ThreadblockShape::kM - 1) / Gemm::ThreadblockShape::kM) * - ((problem.n() + Gemm::ThreadblockShape::kN - 1) / Gemm::ThreadblockShape::kN); - - total_tiles += tiles; - - std::cout << " [" << idx << "]: " - << problem.m() << "-by-" << problem.n() << "-by-" << problem.k() - << " (" << tiles << " threadblock tiles)" << "\n"; - - ++idx; - } - - // Print batched GEMM equivalent - size_t bin_idx = 0; - size_t problem_count_check = 0; - std::cout << "\nConventionally executed as " << options.problem_bins.size() << " batched GEMMs:\n"; - for (auto const & bin : options.problem_bins) { - - std::cout << " [" << bin_idx << "]: " - << bin.first.m() << "-by-" << bin.first.n() << "-by-" << bin.first.k() - << ", batch count: " << bin.second.size() << "\n"; - - ++bin_idx; - problem_count_check += bin.second.size(); - } - - if (problem_count_check != problem_count()) { - std::cout << "\n***\nERROR in BINNING LOGIC!\n***\n" << std::endl; - } - } - - /// Initializes data structures - void initialize_() { - - // - // Choose random problem sizes - // - - // construct a few problems of random sizes - srand(seed); - + /// Allocates device-side data + void allocate() { int64_t total_elements_A = 0; int64_t total_elements_B = 0; int64_t total_elements_C = 0; int64_t total_elements_D = 0; - lda_host.resize(problem_count()); ldb_host.resize(problem_count()); ldc_host.resize(problem_count()); @@ -628,14 +635,22 @@ private: total_elements_D += elements_D; } - problem_sizes_device.reset(problem_count()); - problem_sizes_device.copy_from_host(options.problem_sizes.data()); - lda.reset(problem_count()); ldb.reset(problem_count()); ldc.reset(problem_count()); ldd.reset(problem_count()); + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + } + + /// Initializes device-side data + void initialize() { + problem_sizes_device.reset(problem_count()); + problem_sizes_device.copy_from_host(options.problem_sizes.data()); + lda.copy_from_host(lda_host.data()); ldb.copy_from_host(ldb_host.data()); ldc.copy_from_host(ldc_host.data()); @@ -645,11 +660,6 @@ private: // Assign pointers // - block_A.reset(total_elements_A); - block_B.reset(total_elements_B); - block_C.reset(total_elements_C); - block_D.reset(total_elements_D); - std::vector ptr_A_host(problem_count()); std::vector ptr_B_host(problem_count()); std::vector ptr_C_host(problem_count()); @@ -664,13 +674,13 @@ private: ptr_A.reset(problem_count()); ptr_A.copy_from_host(ptr_A_host.data()); - + ptr_B.reset(problem_count()); ptr_B.copy_from_host(ptr_B_host.data()); - + ptr_C.reset(problem_count()); ptr_C.copy_from_host(ptr_C_host.data()); - + ptr_D.reset(problem_count()); ptr_D.copy_from_host(ptr_D_host.data()); @@ -678,16 +688,16 @@ private: // Initialize the problems of the workspace // - initialize_tensor_(block_A.get(), total_elements_A, init_A, seed * 2021); - initialize_tensor_(block_B.get(), total_elements_B, init_B, seed * 2022); - initialize_tensor_(block_C.get(), total_elements_C, init_C, seed * 2023); + initialize_tensor(block_A.get(), block_A.size(), init_A, seed * 2021); + initialize_tensor(block_B.get(), block_B.size(), init_B, seed * 2022); + initialize_tensor(block_C.get(), block_C.size(), init_C, seed * 2023); cutlass::reference::device::BlockFillSequential( - block_D.get(), total_elements_D, ElementC(), ElementC()); + block_D.get(), block_D.size(), ElementC(), ElementC()); } /// Verifies the result is a GEMM - bool verify_() { + bool 
verify() { bool passed = true; @@ -702,7 +712,7 @@ private: MatrixCoord extent_A{problem.m(), problem.k()}; MatrixCoord extent_B{problem.k(), problem.n()}; MatrixCoord extent_C{problem.m(), problem.n()}; - + cutlass::TensorView view_A(block_A.get() + offset_A.at(i), layout_A, extent_A); cutlass::TensorView view_B(block_B.get() + offset_B.at(i), layout_B, extent_B); cutlass::TensorView view_C(block_C.get() + offset_C.at(i), layout_C, extent_C); @@ -714,18 +724,18 @@ private: cutlass::reference::device::GemmComplex< ElementA, LayoutA, ElementB, LayoutB, - ElementC, LayoutC, + ElementC, LayoutC, ElementCompute, ElementAccumulator >( problem, - options.alpha, + options.alpha, view_A, Gemm::kTransformA, view_B, Gemm::kTransformB, - options.beta, - view_C, - view_Ref_device, + options.beta, + view_C, + view_Ref_device, ElementAccumulator(0) ); @@ -738,7 +748,7 @@ private: cutlass::TensorView view_D( matrix_D.data(), layout_D, extent_C); cutlass::TensorView view_Ref(matrix_Ref.data(), layout_D, extent_C); - + // Reference check passed = cutlass::reference::host::TensorEquals(view_D, view_Ref); @@ -751,227 +761,62 @@ private: return passed; } -public: +}; - /// Returns the number of threadblocks to launch if the kernel can run on the target - /// device. Otherwise, returns zero. - int sufficient() const { - cudaDeviceProp properties; - int device_idx; - cudaError_t result = cudaGetDevice(&device_idx); +template +class TestbedBatched : BaseTestbed { +public: + TestbedBatched( + Options &options_, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint32_t seed_ = 3080 + ): BaseTestbed(options_, init_A_, init_B_, init_C_, seed_) {} - if (result != cudaSuccess) { - throw std::runtime_error("cudaGetDevice() API call failed."); - } + void print_problem_sizes() { + std::cout << std::endl; + size_t bin_idx = 0; + size_t problem_count_check = 0; + std::cout << "Conventionally executed as " << this->options.problem_bins.size() << " batched GEMMs:\n"; + for (auto const & bin : this->options.problem_bins) { - result = cudaGetDeviceProperties(&properties, device_idx); + std::cout << " [" << bin_idx << "]: " + << bin.first.m() << "-by-" << bin.first.n() << "-by-" << bin.first.k() + << ", batch count: " << bin.second.size() << "\n"; - if (result != cudaSuccess) { - throw std::runtime_error("cudaGetDeviceProperties() failed"); + ++bin_idx; + problem_count_check += bin.second.size(); } - int occupancy = Gemm::maximum_active_blocks(); - - return properties.multiProcessorCount * occupancy; + if (problem_count_check != size_t(this->problem_count())) { + std::cout << "\n***\nERROR in BINNING LOGIC!\n***\n" << std::endl; + } + std::cout << std::endl; } - - /// Executes a Grouped GEMM kernel and measures runtime. - Result profile_grouped() { + /// Executes a batched kernel and measures runtime + Result profile() { + std::cout << "Batched GEMM:\n" + << "====================================================" << std::endl; Result result; - - int threadblock_count = sufficient(); - - // Early exit - if (!threadblock_count) { - std::cout << "Active CUDA device lacks hardware resources to run CUTLASS Grouped GEMM kernel." 
<< std::endl; - return result; - } - - if (options.verbose) { - print_problem_sizes_(); - } - result.passed = false; // Initialize the problem - initialize_(); - - // Configure the GEMM arguments - typename EpilogueOutputOp::Params epilogue_op(options.alpha, options.beta); + this->allocate(); + this->initialize(); - // Configure GEMM arguments - typename Gemm::Arguments args( - problem_sizes_device.get(), - problem_count(), - threadblock_count, - epilogue_op, - ptr_A.get(), - ptr_B.get(), - ptr_C.get(), - ptr_D.get(), - lda.get(), - ldb.get(), - ldc.get(), - ldd.get() - ); - - // Initialize the GEMM object - Gemm gemm; - - result.status = gemm.initialize(args); - - if (result.status != cutlass::Status::kSuccess) { - std::cerr << "Failed to initialize CUTLASS Grouped GEMM kernel." << std::endl; - return result; - } - - // Run the grouped GEMM object - result.status = gemm.run(); - - if (result.status != cutlass::Status::kSuccess) { - std::cerr << "Failed to run CUTLASS Grouped GEMM kernel." << std::endl; - return result; - } - - // Wait for completion - result.error = cudaDeviceSynchronize(); - - if (result.error != cudaSuccess) { - std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error); - return result; - } - - // - // Verify correctness - // - result.passed = true; - - if (options.reference_check) { - result.passed = verify_(); - } - - // - // Warm-up run of the grouped GEMM object - // - result.status = gemm.run(); - - if (result.status != cutlass::Status::kSuccess) { - std::cerr << "Failed to run CUTLASS Grouped GEMM kernel." << std::endl; - return result; - } - - // - // Construct events - // - - cudaEvent_t events[2]; - - for (auto & event : events) { - result.error = cudaEventCreate(&event); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; - return -1; - } - } - - // Record an event at the start of a series of GEMM operations - result.error = cudaEventRecord(events[0]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; - } - - // - // Run profiling loop - // - - for (int iter = 0; iter < options.iterations; ++iter) { - gemm(); - } - - // - // Stop profiling loop - // - - // Record an event when the GEMM operations have been launched. - result.error = cudaEventRecord(events[1]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; - } - - // Wait for work on the device to complete. - result.error = cudaEventSynchronize(events[1]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; - } - - // Measure elapsed runtime - float runtime_ms = 0; - result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); - if (result.error != cudaSuccess) { - std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; - return result; + if (this->options.verbose) { + print_problem_sizes(); } - // Compute average runtime and GFLOPs. 
- result.runtime_ms = double(runtime_ms) / double(options.iterations); - result.gflops = options.gflops(result.runtime_ms / 1000.0); - - // - // Cleanup - // - - for (auto event : events) { - (void)cudaEventDestroy(event); - } - - int32_t idx = 0; - int64_t total_tiles = 0; - - for (auto const & problem : options.problem_sizes) { - - int tiles = - ((problem.m() + Gemm::ThreadblockShape::kM - 1) / Gemm::ThreadblockShape::kM) * - ((problem.n() + Gemm::ThreadblockShape::kN - 1) / Gemm::ThreadblockShape::kN); - - total_tiles += tiles; - ++idx; - } - - std::cout << std::endl; - std::cout << "Grouped GEMM (CUTLASS):\n" - << "====================================================" << std::endl; - - std::cout << " " << total_tiles << " total threadblock tiles." << std::endl; - - std::cout << std::endl; - std::cout << " " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " " << "Grouped GFLOPs: " << result.gflops << std::endl; - - if (options.output_file.good()) { - options.output_file << options.output_tag << ",CUTLASS,grouped," - << problem_count() << "," << result.runtime_ms << "," << result.gflops << std::endl; - } - - return result; - } - - /// Executes a conventional batched GEMM kernel. - Result profile_batched() { - - Result result; - result.passed = false; - // // Prepare batched GEMM environment // - int32_t effective_streams = (options.cuda_streams ? options.cuda_streams : 1); + int32_t effective_streams = (this->options.cuda_streams ? this->options.cuda_streams : 1); // Array of leading dimensions used by batched GEMM calls std::vector bin_problem_sizes; @@ -985,15 +830,15 @@ public: std::vector ptr_B_batched_host; std::vector ptr_C_batched_host; - for (auto const & bin : options.problem_bins) { + for (auto const & bin : this->options.problem_bins) { int first_idx = bin.second.front(); - - bin_problem_sizes.push_back(options.problem_sizes.at(first_idx)); + + bin_problem_sizes.push_back(this->options.problem_sizes.at(first_idx)); bin_count.push_back(int32_t(bin.second.size())); - bin_ldm_A.push_back(static_cast(lda_host.at(first_idx))); - bin_ldm_B.push_back(static_cast(ldb_host.at(first_idx))); - bin_ldm_C.push_back(static_cast(ldc_host.at(first_idx))); + bin_ldm_A.push_back(static_cast(this->lda_host.at(first_idx))); + bin_ldm_B.push_back(static_cast(this->ldb_host.at(first_idx))); + bin_ldm_C.push_back(static_cast(this->ldc_host.at(first_idx))); if (ptr_A_batched_host.size() % 2) { ptr_A_batched_host.push_back(nullptr); @@ -1005,29 +850,29 @@ public: for (int idx : bin.second) { - if (bin_problem_sizes.back() != options.problem_sizes.at(idx)) { + if (bin_problem_sizes.back() != this->options.problem_sizes.at(idx)) { std::cerr << "Error - failed to group problems.\n"; return result; } - if (bin_ldm_A.back() != lda_host.at(idx)) { + if (bin_ldm_A.back() != this->lda_host.at(idx)) { std::cerr << "Error - failed to group problems.\n"; return result; } - if (bin_ldm_B.back() != ldb_host.at(idx)) { + if (bin_ldm_B.back() != this->ldb_host.at(idx)) { std::cerr << "Error - failed to group problems.\n"; return result; } - if (bin_ldm_C.back() != ldc_host.at(idx)) { + if (bin_ldm_C.back() != this->ldc_host.at(idx)) { std::cerr << "Error - failed to group problems.\n"; return result; } - ptr_A_batched_host.push_back(block_A.get() + offset_A.at(idx)); - ptr_B_batched_host.push_back(block_B.get() + offset_B.at(idx)); - ptr_C_batched_host.push_back(block_D.get() + offset_C.at(idx)); + ptr_A_batched_host.push_back(this->block_A.get() + this->offset_A.at(idx)); + 
ptr_B_batched_host.push_back(this->block_B.get() + this->offset_B.at(idx)); + ptr_C_batched_host.push_back(this->block_D.get() + this->offset_C.at(idx)); } } @@ -1048,15 +893,14 @@ public: // Create CUDA streams to maximize concurrency of batched-array GEMM kernels // std::vector cuda_streams; - char const *provider = "CUTLASS"; // // Warmup run // - if (options.cuda_streams) { - for (int i = 0; i < options.cuda_streams; ++i) { + if (this->options.cuda_streams) { + for (int i = 0; i < this->options.cuda_streams; ++i) { cudaStream_t stream; result.error = cudaStreamCreate(&stream); @@ -1074,7 +918,7 @@ public: } // Use 'D' for the in/out workspace - block_D.copy_from_device(block_C.get()); + this->block_D.copy_from_device(this->block_C.get()); for (int bin_idx = 0; bin_idx < int32_t(bin_problem_sizes.size()); ++bin_idx) { @@ -1094,9 +938,9 @@ public: // // Configure the GEMM arguments - typename EpilogueOutputOp::Params epilogue_op(options.alpha, options.beta); + typename Gemm::EpilogueOutputOp::Params epilogue_op(this->options.alpha, this->options.beta); - typename GemmBatched::Arguments arguments{ + typename Gemm::Arguments arguments{ cutlass::gemm::GemmUniversalMode::kArray, problem, batch_count, @@ -1115,7 +959,7 @@ public: int64_t(ldc) }; - GemmBatched gemm_op; + Gemm gemm_op; cutlass::Status status = gemm_op.initialize(arguments); @@ -1130,7 +974,7 @@ public: std::cerr << "CUTLASS error on line " << __LINE__ << std::endl; return result; } - + } // @@ -1182,8 +1026,8 @@ public: int last_stream_idx = 0; - for (int iter = 0; iter < options.iterations; ++iter) { - + for (int iter = 0; iter < this->options.iterations; ++iter) { + for (int bin_idx = 0; bin_idx < int32_t(bin_problem_sizes.size()); ++bin_idx) { cutlass::gemm::GemmCoord const & problem = bin_problem_sizes[bin_idx]; @@ -1204,9 +1048,9 @@ public: // // Configure the GEMM arguments - typename EpilogueOutputOp::Params epilogue_op(options.alpha, options.beta); + typename Gemm::EpilogueOutputOp::Params epilogue_op(this->options.alpha, this->options.beta); - typename GemmBatched::Arguments arguments{ + typename Gemm::Arguments arguments{ cutlass::gemm::GemmUniversalMode::kArray, problem, batch_count, @@ -1225,7 +1069,7 @@ public: int64_t(ldc) }; - GemmBatched gemm_op; + Gemm gemm_op; cutlass::Status status = gemm_op.initialize(arguments); @@ -1254,7 +1098,7 @@ public: std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; return result; } - + // // Wait for work to be completed // @@ -1266,15 +1110,263 @@ public: return result; } - // Wait for work on the device to complete. - result.error = cudaEventSynchronize(events[1]); + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); if (result.error != cudaSuccess) { - std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(this->options.iterations); + result.gflops = this->options.gflops(result.runtime_ms / 1000.0); + + // + // Cleanup + // + + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + for (auto stream : cuda_streams) { + if (stream) { + (void)cudaStreamDestroy(stream); + } + } + + std::cout << " " << this->options.problem_bins.size() << " batched GEMMs launched" << std::endl; + std::cout << std::endl; + std::cout << " " << "Batched Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " " << "Batched GFLOPs: " << result.gflops << std::endl; + + std::string provider = "CUTLASS"; + + if (this->options.output_file.good()) { + this->options.output_file << this->options.output_tag << "," << provider << ",batched," + << this->options.problem_count << "," << result.runtime_ms << "," << result.gflops << std::endl; + } + + result.passed = true; + return result; + } +}; + +template +class TestbedGrouped : BaseTestbed { +public: + TestbedGrouped( + Options &options_, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint32_t seed_ = 3080 + ): BaseTestbed(options_, init_A_, init_B_, init_C_, seed_) {} + + // Redefine GEMM with different GroupScheduleMode_ + using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped< + typename Gemm_::ElementA, + typename Gemm_::LayoutA, + Gemm_::kTransformA, + Gemm_::kAlignmentA, + typename Gemm_::ElementB, + typename Gemm_::LayoutB, + Gemm_::kTransformB, + Gemm_::kAlignmentB, + typename Gemm_::ElementC, + typename Gemm_::LayoutC, + typename Gemm_::ElementAccumulator, + typename Gemm_::OperatorClass, + typename Gemm_::ArchTag, + typename Gemm_::ThreadblockShape, + typename Gemm_::WarpShape, + typename Gemm_::InstructionShape, + typename Gemm_::EpilogueOutputOp, + typename Gemm_::ThreadblockSwizzle, + Gemm_::kStages, + GroupScheduleMode_>::GemmKernel; + + using Gemm = cutlass::gemm::device::GemmGrouped; + + /// Verbose printing of problem sizes + void print_problem_sizes() { + std::cout << std::endl; + + // Print groups + std::cout << this->problem_count() << " groups:\n"; + + int32_t idx = 0; + int64_t total_tiles = 0; + + for (auto const & problem : this->options.problem_sizes) { + int tiles = Gemm::problem_tile_count(problem); + total_tiles += tiles; + + std::cout << " [" << idx << "]: " + << problem.m() << "-by-" << problem.n() << "-by-" << problem.k() + << " (" << tiles << " threadblock tiles)" << "\n"; + + ++idx; + } + std::cout << std::endl; + } + + /// Sort problems in descending order of problem-K dimension + void sort_problems() { + Gemm::sort_problems(this->options.problem_count, + this->options.problem_sizes.data(), + this->lda_host.data(), + this->ldb_host.data(), + this->ldc_host.data(), + this->ldd_host.data(), + this->offset_A.data(), + this->offset_B.data(), + this->offset_C.data(), + this->offset_D.data()); + } + + /// Executes a grouped kernel and measures runtime + Result profile() { + std::string sched_mode = this->options.scheduler_mode_to_str.find(GroupScheduleMode_)->second; + + std::cout << std::endl; + std::cout << "Grouped GEMM (CUTLASS) with mode " << sched_mode << ":\n" + << "====================================================" << std::endl; + + Result result; + + int threadblock_count = Gemm::sufficient(this->options.problem_sizes.data(), this->options.problem_count); + + // Early exit + 
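+ // As with the earlier per-testbed sufficient() helper, Gemm::sufficient() is expected to return the number of threadblocks to launch when the kernel can run on the active device, and zero otherwise.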
if (!threadblock_count) { + std::cout << "Active CUDA device lacks hardware resources to run CUTLASS Grouped GEMM kernel." << std::endl; + return result; + } + + result.passed = false; + + // Initialize the problem + this->allocate(); + if (this->options.sort_problems) { + sort_problems(); + } + this->initialize(); + + if (this->options.verbose) { + print_problem_sizes(); + } + + // Configure the GEMM arguments + typename Gemm::EpilogueOutputOp::Params epilogue_op(this->options.alpha, this->options.beta); + + // Configure GEMM arguments + typename Gemm::Arguments args( + this->problem_sizes_device.get(), + this->problem_count(), + threadblock_count, + epilogue_op, + this->ptr_A.get(), + this->ptr_B.get(), + this->ptr_C.get(), + this->ptr_D.get(), + this->lda.get(), + this->ldb.get(), + this->ldc.get(), + this->ldd.get(), + this->options.problem_sizes.data() + ); + + // Initialize the GEMM object + Gemm gemm; + + size_t workspace_size = gemm.get_workspace_size(args); + cutlass::DeviceAllocation workspace(workspace_size); + + result.status = gemm.initialize(args, workspace.get()); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to initialize CUTLASS Grouped GEMM kernel." << std::endl; + return result; + } + + // Run the grouped GEMM object + result.status = gemm.run(); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to run CUTLASS Grouped GEMM kernel." << std::endl; + return result; + } + + // Wait for completion + result.error = cudaDeviceSynchronize(); + + if (result.error != cudaSuccess) { + std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error); + return result; + } + + // + // Verify correctness + // + result.passed = true; + + if (this->options.reference_check) { + result.passed = this->verify(); + } + + // + // Warm-up run of the grouped GEMM object + // + result.status = gemm.run(); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to run CUTLASS Grouped GEMM kernel." << std::endl; + return result; + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < this->options.iterations; ++iter) { + gemm(); + } + + // + // Stop profiling loop + // + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; return result; } // Wait for work on the device to complete. - result.error = cudaEventSynchronize(events[0]); + result.error = cudaEventSynchronize(events[1]); if (result.error != cudaSuccess) { std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; return result; @@ -1289,8 +1381,8 @@ public: } // Compute average runtime and GFLOPs. 
- result.runtime_ms = double(runtime_ms) / double(options.iterations); - result.gflops = options.gflops(result.runtime_ms / 1000.0); + result.runtime_ms = double(runtime_ms) / double(this->options.iterations); + result.gflops = this->options.gflops(result.runtime_ms / 1000.0); // // Cleanup @@ -1299,28 +1391,40 @@ public: for (auto event : events) { (void)cudaEventDestroy(event); } - - for (auto stream : cuda_streams) { - if (stream) { - (void)cudaStreamDestroy(stream); + + // Optionally profile initialization + if (this->options.profile_initialization) { + // Warm up + gemm.initialize(args, workspace.get()); + + auto start_time = std::chrono::high_resolution_clock::now(); + for (int32_t i = 0; i < this->options.iterations; ++i) { + gemm.initialize(args, workspace.get()); } + auto end_time = std::chrono::high_resolution_clock::now(); + + std::chrono::duration duration = end_time - start_time; + duration /= double(this->options.iterations); + result.initialization_time_ms = duration.count(); } - std::cout << std::endl; - std::cout << "Batched GEMM:\n" - << "====================================================" << std::endl; + int64_t total_tiles = Gemm::group_tile_count(args); + std::cout << " " << total_tiles << " total threadblock tiles." << std::endl; - std::cout << " " << bin_problem_sizes.size() << " batched GEMMs launched" << std::endl; std::cout << std::endl; - std::cout << " " << "Batched Runtime: " << result.runtime_ms << " ms" << std::endl; - std::cout << " " << "Batched GFLOPs: " << result.gflops << std::endl; + std::cout << " " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " " << "Grouped GFLOPs: " << result.gflops << std::endl; + if (this->options.profile_initialization) { + std::cout << " " << "Init Runtime: " << result.initialization_time_ms << " ms" << std::endl; + } - if (options.output_file.good()) { - options.output_file << options.output_tag << "," << provider << ",batched," - << problem_count() << "," << result.runtime_ms << "," << result.gflops << std::endl; + if (this->options.output_file.good()) { + this->options.output_file << this->options.output_tag << ",CUTLASS,grouped-" << sched_mode << "," + << this->options.problem_count << "," << result.runtime_ms << "," << result.gflops << std::endl; } - result.passed = true; + std::cout << "\nPassed\n"; + return result; } }; @@ -1329,10 +1433,6 @@ public: int main(int argc, char const **args) { - // - // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. - // - cudaDeviceProp props; cudaError_t error = cudaGetDeviceProperties(&props, 0); @@ -1342,12 +1442,12 @@ int main(int argc, char const **args) { } if (__CUDACC_VER_MAJOR__ < 11 || props.major < 8) { - + // // This example requires an NVIDIA Ampere-architecture GPU. 
// - std::cout + std::cout << "CUTLASS's Grouped GEMM example requires a GPU of NVIDIA's Ampere Architecture or " << "later (compute capability 80 or greater).\n"; @@ -1359,7 +1459,7 @@ int main(int argc, char const **args) { // Options options; - + options.parse(argc, args); if (options.help) { @@ -1373,9 +1473,11 @@ int main(int argc, char const **args) { } // - // Define the Grouped GEMM type + // Define the Grouped and Batched GEMM types // + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; using ElementOutput = cutlass::half_t; using ElementAccumulator = float; @@ -1383,18 +1485,42 @@ int main(int argc, char const **args) { using LayoutB = cutlass::layout::ColumnMajor; using LayoutC = cutlass::layout::ColumnMajor; + // Gemm operator cutlass_tensorop_f16_s16816gemm_f16_128x128_32x4_nt_align8 + using GemmBatched = cutlass::gemm::device::GemmUniversal< + ElementA, LayoutA, + ElementB, LayoutB, + ElementOutput, LayoutC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 4 + >; + + // Define a grouped GEMM kernel with all template parameters set except + // for scheduling mode. This will be used as the template for all scheduling + // modes executed. using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped< - cutlass::half_t, + ElementA, LayoutA, cutlass::ComplexTransform::kNone, 8, - cutlass::half_t, + ElementB, LayoutB, cutlass::ComplexTransform::kNone, 8, ElementOutput, LayoutC, - ElementAccumulator, - cutlass::arch::OpClassTensorOp, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 128, 32>, cutlass::gemm::GemmShape<64, 64, 32>, @@ -1402,64 +1528,50 @@ int main(int argc, char const **args) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementAccumulator>, - cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + // NOTE: Threadblock swizzling is currently not supported by CUTLASS's grouped kernels. + // This parameter is passed in at present to match the APIs of other kernels. The parameter + // is unused within the kernel. 
+ cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, 4>::GemmKernel; using GemmGrouped = cutlass::gemm::device::GemmGrouped; - // - // Define a conventional batched GEMM type - // - - // Gemm operator cutlass_tensorop_f16_s16816gemm_f16_128x128_32x4_nt_align8 - using GemmBatched = cutlass::gemm::device::GemmUniversal< - cutlass::half_t, LayoutA, - cutlass::half_t, LayoutB, - ElementOutput, LayoutC, - ElementAccumulator, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, - 128 / cutlass::sizeof_bits::value, - ElementAccumulator, - ElementAccumulator - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, - 4 - >; - // // Profile it // - TestbedGrouped testbed(options); - - if (!testbed.sufficient()) { - std::cout << "The active CUDA device lacks sufficient hardware resources to execute this kernel.\n"; - return 0; + TestbedBatched testbed_batched(options); + Result result = testbed_batched.profile(); + if (result.error) { + return 1; } - Result result = testbed.profile_grouped(); - if (!result.passed) { - std::cout << "Profiling CUTLASS grouped GEMM has failed.\n"; - std::cout << "\nFailed\n"; - return -1; - } + using GroupScheduleMode = cutlass::gemm::kernel::GroupScheduleMode; + for (GroupScheduleMode mode : options.scheduler_modes) { + Result result; + switch (mode) { + case GroupScheduleMode::kDeviceOnly: + { + TestbedGrouped runner(options); + result = runner.profile(); + break; + } + case GroupScheduleMode::kHostPrecompute: + { + TestbedGrouped runner(options); + result = runner.profile(); + break; + } + } - result = testbed.profile_batched(); - if (!result.passed) { + if (result.error != cudaSuccess) { + return 1; + } - std::cout << "Profiling batched GEMM has failed.\n"; - std::cout << "\nFailed\n"; - return -1; + // Override verbose flag to avoid printing duplicate information for each scheduling mode + options.verbose = false; } - std::cout << "\nPassed\n"; - return 0; } diff --git a/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt b/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt index 0bf0c775e9..ce9a0bd0c3 100644 --- a/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt +++ b/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -34,3 +34,8 @@ cutlass_example_add_executable( ampere_fprop_mainloop_fusion.cu ) +cutlass_example_add_executable( + 25_ampere_3d_fprop_mainloop_fusion + ampere_3d_fprop_mainloop_fusion.cu + ) + diff --git a/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu b/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu new file mode 100644 index 0000000000..a1ca2b078f --- /dev/null +++ b/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu @@ -0,0 +1,776 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + +This example shows how to fuse per-channel scale+bias+relu of the activations +into the 3D fprop mainloop. + +Compared with the original 3D fprop kernel, this example has two more vectors, one for +the scale and one for the bias. The length of the vectors equals the number of +activation channels. This kernel loads the vectors when the associated +activation channels are loaded in the mainloop. Between reading the +activations and scale/bias data from the shared memory and calling tensor core +instructions, scale+bias+relu is computed in the register file. + +This example is customized for the Ampere 16816 FP16 tensor core instruction. +Changing to different data types or a different tensor core instruction requires +source code changes. See +include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h for more +technical details. + +This example is adapted from 25_ampere_fprop_mainloop_fusion. The command +line is the same.
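+
+As a quick illustration (a sketch only; x, scale[c], and bias[c] are informal names for an
+activation element and the per-channel vectors described above, not identifiers from this
+file), the fused transform applied to each activation element of channel c is
+
+    x_fused = max(0, x * scale[c] + bias[c])
+
+which is also what the host-side reference check in this example computes.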
+*/ + +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/conv/kernel/default_conv3d_fprop_fusion.h" +#include "cutlass/conv/device/implicit_gemm_convolution_fusion.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/convolution.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +// The code section below describes the data types for the input and output tensors and the computation between +// elements +using ElementAccumulator = float; // Data type of accumulator +using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) +using ElementInputA = cutlass::half_t; // Data type of elements in input tensor +using ElementInputB = cutlass::half_t; // Data type of elements in input tensor +using ElementInputScaleBias = cutlass::half_t; // Data type of elements in input scale and bias vectors +using ElementOutput = float; // Data type of elements in output tensor + +using LayoutInputA = cutlass::layout::TensorNDHWC; +using LayoutInputB = cutlass::layout::TensorNDHWC; +using LayoutInputScaleBias = cutlass::layout::RowMajor; +using LayoutOutput = cutlass::layout::TensorNDHWC; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes the CUDA SM architecture number +using SmArch = cutlass::arch::Sm80; + +// This code section describes the tile size a thread block will compute +using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; // Threadblock tile shape + +// This code section describes the tile size a warp will compute +using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; // Warp tile shape + +// This code section describes the size of the MMA op +using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; // TensorCore instruction shape + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + +// Number of pipelines you want to use +constexpr int NumStages = 4; + +// This code section describes whether the selected iterator algorithm is Analytic or Optimized +static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized; + +// This code section describes the epilogue part of the kernel; we use the default value +using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // Data type of output matrix. + 128 / cutlass::sizeof_bits::value, // The number of elements per vectorized + // memory access. This becomes the vector width of + // math instructions in the epilogue too.
+ ElementAccumulator, // Data type of accumulator + ElementComputeEpilogue>; // Data type for alpha/beta in linear combination + +using Conv3dFpropFusionKernel = typename cutlass::conv::kernel::DefaultConv3dFpropFusion< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, + ElementInputScaleBias, LayoutInputScaleBias, + ElementOutput, LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + IteratorAlgorithm +>::Kernel; + +using ImplicitGemmFusion = cutlass::conv::device::ImplicitGemmConvolutionFusion; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + cutlass::Tensor5DCoord input_size; + cutlass::Tensor5DCoord filter_size; + cutlass::Coord<3> padding; + cutlass::Coord<3> conv_stride; + cutlass::Coord<3> dilation; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool benchmark; + std::string tag; + + Options(): + help(false), + input_size(1, 32, 32, 32, 32), + filter_size(32, 3, 3, 3, 32), + padding(cutlass::make_Coord(1, 1, 1)), + conv_stride(cutlass::make_Coord(1, 1, 1)), + dilation(cutlass::make_Coord(1, 1, 1)), + reference_check(true), + measure_performance(false), + iterations(20), + save_workspace(false), + alpha(1), + beta(0), + benchmark(false) { } + + // Verify the problem size is compatible with the CUTLASS Convolution implementation. + bool valid() { + + // + // CUTLASS attempts to load 128b vectors of cutlass::half_t (F16) elements. Consequently, + // all pointers, strides, and tensor extents must be divisible by 8 elements. 
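+    // (128 bits per vector access) / (16 bits per cutlass::half_t element) = 8 elements per access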
+ // + int const kAlignment = 8; + + if ((input_size.c() % kAlignment) || + (filter_size.n() % kAlignment)) { + + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding[0] != filter_size.d() / 2) || + (padding[1] != filter_size.h() / 2) || + (padding[2] != filter_size.w() / 2)) { + + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update( + cutlass::Tensor5DCoord input_size, + cutlass::Tensor5DCoord filter_size, + cutlass::Coord<3> stride) { + + this->input_size = input_size; + this->filter_size = filter_size; + conv_stride = stride; + + padding[0] = filter_size.d() / 2; + padding[1] = filter_size.h() / 2; + padding[2] = filter_size.w() / 2; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + if (cmd.check_cmd_line_flag("ref-check")) { + reference_check = true; + } + + if (cmd.check_cmd_line_flag("perf-check")) { + measure_performance = true; + } + + if (cmd.check_cmd_line_flag("save-workspace")) { + save_workspace = true; + } + + if (cmd.check_cmd_line_flag("benchmark")) { + benchmark = true; + } + + cmd.get_cmd_line_argument("n", input_size.n()); + cmd.get_cmd_line_argument("d", input_size.d()); + cmd.get_cmd_line_argument("h", input_size.h()); + cmd.get_cmd_line_argument("w", input_size.w()); + cmd.get_cmd_line_argument("c", input_size.c()); + + cmd.get_cmd_line_argument("k", filter_size.n()); + cmd.get_cmd_line_argument("t", filter_size.d()); + cmd.get_cmd_line_argument("r", filter_size.h()); + cmd.get_cmd_line_argument("s", filter_size.w()); + filter_size.c() = input_size.c(); + + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("tag", tag); + + if (filter_size.d() == 3 && filter_size.h() == 3 && filter_size.w() == 3) { + padding = cutlass::make_Coord(1, 1, 1); + } + else { + filter_size.d() = 1; + filter_size.h() = 1; + filter_size.w() = 1; + padding = cutlass::make_Coord(0, 0, 0); + } + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "25_ampere_3d_fprop_mainloop_fusion example\n\n" + << " This example fuses scale+bias+relu of the activations into Ampere's\n" + << " Tensor Core operators on F16 data types to compute\n" + << " forward convolution on tensors of layout NDHWC.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --n Input tensor extent N\n" + << " --d Input tensor extent D\n" + << " --h Input tensor extent H\n" + << " --w Input tensor extent W\n" + << " --c Input tensor extent C\n" + << " --k Filter extent K\n" + << " --t Filter extent T\n" + << " --r Filter extent R\n" + << " --s Filter extent S\n\n" + << " --alpha Epilogue scalar alpha\n" + << " --beta Epilogue scalar beta\n\n" + << " --ref-check If set (true), reference check on the host is computed\n" + << " --perf-check If set (true), performance is measured.\n" + << " --benchmark If set (true), performance benchmarking on several layers and batch-size.\n" + << " --iterations Number of profiling iterations to perform.\n" + << " --save-workspace If set, workspace is written to a text file.\n" + << " --tag String to replicate across the first column in the results table\n"; + + out << "\n\nExamples:\n\n" + << "$ ./25_ampere_3d_fprop_mainloop_fusion --n=32 --d=96 --h=96 --w=96 --c=64 --k=64 --t=1 --r=1 --s=1\n\n" + << "$ ./25_ampere_3d_fprop_mainloop_fusion --n=1 --d=224 --h=224 --w=224 --c=32 --k=32 --t=3 --r=3 --s=3 --ref-check\n\n" + << "$ ./25_ampere_3d_fprop_mainloop_fusion --n=19 --d=94 --h=96 --w=96 --c=128 --k=128 --t=1 --r=1 --s=1\n\n"; + + return out; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor5DCoord output_size() const { + return cutlass::Tensor5DCoord( + input_size.n(), + (input_size.d() + padding[0] + padding[0] - filter_size.d()) / conv_stride[0] + 1, + (input_size.h() + padding[1] + padding[1] - filter_size.h()) / conv_stride[1] + 1, + (input_size.w() + padding[2] + padding[2] - filter_size.w()) / conv_stride[2] + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of multiply-adds = NPQK * CRS + int64_t fmas = output_size().product() * int64_t(filter_size.d() * filter_size.h() * filter_size.w() * filter_size.c()); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct Result { + double runtime_ms; + double gflops; + cutlass::Status status; + cutlass::Status reference_check; + cudaError_t error; + + Result(): + runtime_ms(0), + gflops(0), + status(cutlass::Status::kSuccess), + reference_check(cutlass::Status::kInvalid), + error(cudaSuccess) { } + + static std::ostream & print_header(std::ostream &out, Options const &options) { + + if (!options.tag.empty()) { + out << "Name,"; + } + + out << "Layer,N,D,H,W,C,K,T,R,S,Stride_D,Stride_H,Stride_W,Runtime,GFLOPs"; + + return out; + } + + std::ostream & print(std::ostream &out, int idx, Options const &options) { + + if (!options.tag.empty()) { + out << options.tag << ","; + } + + out + << "conv_" << idx << "," + << options.input_size.n() << "," + << options.input_size.d() << "," + << options.input_size.h() << "," + << options.input_size.w() << "," + << options.input_size.c() << "," + << options.filter_size.n() << "," + << options.filter_size.d() << "," + << options.filter_size.h() << "," + << options.filter_size.w() << "," + << 
options.conv_stride[0] << "," + << options.conv_stride[1] << "," + << options.conv_stride[2] << "," + << runtime_ms << "," + << gflops; + + return out; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Runs one benchmark +Result profile_convolution(Options const &options) { + + Result result; + + // + // Allocate host-device tensors using the CUTLASS Utilities. + // + + cutlass::HostTensor tensor_a(options.input_size); + cutlass::HostTensor tensor_transformed_a(options.input_size); + cutlass::HostTensor tensor_b(options.filter_size); + cutlass::HostTensor + tensor_a_scale({1, options.input_size.c()}); + cutlass::HostTensor + tensor_a_bias({1, options.input_size.c()}); + cutlass::HostTensor tensor_c(options.output_size()); + cutlass::HostTensor tensor_d(options.output_size()); + cutlass::HostTensor tensor_ref_d(options.output_size()); + + // + // Initialize tensors + // + + // Fill tensor A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(3), + ElementInputA(-4), + 0); + + // Fill scale vector for tensor A on host with uniform-distribution random + // data + cutlass::reference::host::TensorFillRandomUniform( + tensor_a_scale.host_view(), + 1, + ElementInputA(3), + ElementInputA(-4), + 0); + + // Fill bias vector for tensor A on host with uniform-distribution random + // data + cutlass::reference::host::TensorFillRandomUniform( + tensor_a_bias.host_view(), + 1, + ElementInputA(3), + ElementInputA(-4), + 0); + + // Fill tensor B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(7), + ElementInputB(-8), + 0); + + // Fill tensor C on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c.host_view(), + 1, + ElementOutput(7), + ElementOutput(-8), + 0); + + // Fill tensor D for reference on host with zeros + cutlass::reference::host::TensorFill( + tensor_ref_d.host_view()); + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_a_scale.sync_device(); + tensor_a_bias.sync_device(); + tensor_b.sync_device(); + tensor_c.sync_device(); + tensor_d.sync_device(); + tensor_ref_d.sync_device(); + + // + // Define arguments for CUTLASS Convolution + // + + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + // Split K dimension into 1 partitions + int split_k_slices = 1; + + // Construct Conv3dProblemSize with user defined output size + cutlass::conv::Conv3dProblemSize problem_size( + options.input_size, + options.filter_size, + options.padding, + options.conv_stride, + options.dilation, + options.output_size(), + mode, + split_k_slices + ); + + typename ImplicitGemmFusion::Arguments arguments{ + problem_size, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_a_scale.device_ref(), + tensor_a_bias.device_ref(), + tensor_c.device_ref(), + tensor_d.device_ref(), + {options.alpha, options.beta}, + }; + + // + // Initialize CUTLASS Convolution + // + + ImplicitGemmFusion implicit_gemm_fusion_op; + + size_t workspace_size = implicit_gemm_fusion_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + result.status = implicit_gemm_fusion_op.can_implement(arguments); + CUTLASS_CHECK(result.status); + + result.status = implicit_gemm_fusion_op.initialize(arguments, workspace.get()); + 
CUTLASS_CHECK(result.status); + + // + // Launch initialized CUTLASS kernel + // + result.status = implicit_gemm_fusion_op(); + + CUTLASS_CHECK(result.status); + + // + // Optional reference check + // + + if (options.reference_check) { + std::cout << "Verification on device...\n"; + + // Compute scale + bias + relu in host code + for (int n = 0; n < options.input_size.n(); ++n) { + for (int d = 0; d < options.input_size.d(); ++d) { + for (int h = 0; h < options.input_size.h(); ++h) { + for (int w = 0; w < options.input_size.w(); ++w) { + for (int c = 0; c < options.input_size.c(); ++c) { + tensor_transformed_a.at({n, d, h, w, c}) = std::max( + ElementOutput(0), ElementOutput(tensor_a.at({n, d, h, w, c}) * + tensor_a_scale.at({0, c}) + + tensor_a_bias.at({0, c}))); + } + } + } + } + } + + tensor_transformed_a.sync_device(); + + // Compute with reference implementation + cutlass::reference::device::Conv3dFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementComputeEpilogue, + ElementAccumulator, + cutlass::NumericConverter + >( + problem_size, + tensor_transformed_a.device_ref(), + tensor_b.device_ref(), + tensor_c.device_ref(), + tensor_ref_d.device_ref(), + options.alpha, + options.beta + ); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + tensor_d.sync_host(); + tensor_ref_d.sync_host(); + + bool passed = cutlass::reference::host::TensorEquals( + tensor_d.host_view(), + tensor_ref_d.host_view()); + + if (!passed) { + result.reference_check = cutlass::Status::kErrorInternal; + std::cout << "ERROR - results miscompared.\n"; + } + else { + result.reference_check = cutlass::Status::kSuccess; + std::cout << "Passed.\n"; + } + } + else { + result.reference_check = cutlass::Status::kInvalid; + } + + if (options.save_workspace) { + + std::stringstream ss; + + ss << "25_ampere_3d_fprop_mainloop_fusion" + << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() + << "_" + << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() + << ".dat"; + + std::ofstream output_workspace(ss.str()); + + output_workspace + << "Input = \n" << tensor_a.host_view() << "\n\n" + << "Filters = \n" << tensor_b.host_view() << "\n\n"; + + if (options.reference_check) { + output_workspace << "Reference = \n" << tensor_ref_d.host_view() << "\n\n"; + } + + output_workspace << "Computed = \n" << tensor_d.host_view() << std::endl; + + std::cout << "Results written to '" << ss.str() << "'." << std::endl; + } + + // + // Performance measurement + // + + if (options.measure_performance) { + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + } + + // Record an event at the start of a series of convolution operations. + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Launch a sequence of implicit GEMM operations on the device + for (int iteration = 0; iteration < options.iterations; ++iteration) { + result.status = implicit_gemm_fusion_op(); + CUTLASS_CHECK(result.status); + } + + // Record an event when the convolutions have been launched. 
+ result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Print average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + } + + return result; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + bool notSupported = false; + + // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0. + // + // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv3dFprop examples. + if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + notSupported = true; + } + + cudaDeviceProp props; + CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); + + if (!(props.major >= 8)) { + std::cerr << "This test must run on SM80 or above.\n"; + notSupported = true; + } + + if (notSupported) { + return 0; + } + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.benchmark) { + // Benchmark several layers + + int batch_sizes[] = {34, 18}; + + struct Benchmark { + int d, h, w, c, k, t, r, s, stride_d, stride_h, stride_w; + } layers[] = { + {56, 56, 56, 64, 256, 1, 1, 1, 1, 1, 1}, + {56, 56, 56, 64, 64, 1, 1, 1, 1, 1, 1}, + {56, 56, 56, 64, 64, 3, 3, 3, 1, 1, 1}, + {56, 56, 56, 256, 64, 1, 1, 1, 1, 1, 1}, + {56, 56, 56, 256, 512, 1, 1, 1, 2, 2, 2}, + {56, 56, 56, 256, 128, 1, 1, 1, 1, 1, 1}, + {56, 56, 56, 128, 128, 3, 3, 3, 2, 2, 2}, + {28, 28, 28, 128, 512, 1, 1, 1, 1, 1, 1}, + {28, 28, 28, 512, 128, 1, 1, 1, 1, 1, 1}, + {28, 28, 28, 128, 128, 3, 3, 3, 1, 1, 1}, + {28, 28, 28, 512, 1024, 1, 1, 1, 2, 2, 2}, + {28, 28, 28, 512, 256, 1, 1, 1, 1, 1, 1}, + {28, 28, 28, 256, 256, 3, 3, 3, 2, 2, 2}, + {14, 14, 14, 256, 1024, 1, 1, 1, 1, 1, 1}, + {14, 14, 14, 1024, 256, 1, 1, 1, 1, 1, 1}, + {14, 14, 14, 256, 256, 3, 3, 3, 1, 1, 1}, + {14, 14, 14, 1024, 2048, 1, 1, 1, 2, 2, 2}, + {14, 14, 14, 1024, 512, 1, 1, 1, 1, 1, 1}, + {14, 14, 14, 512, 512, 3, 3, 3, 2, 2, 2}, + { 7, 7, 7, 512, 2048, 1, 1, 1, 1, 1, 1}, + { 7, 7, 7, 2048, 512, 1, 1, 1, 1, 1, 1}, + { 7, 7, 7, 512, 512, 3, 3, 3, 1, 1, 1}, + }; + + Result::print_header(std::cout, options) << std::endl; + + int idx = 1; + + for (auto const &layer : layers) { + for (auto N : batch_sizes) { + options.update({N, layer.d, layer.h, layer.w, layer.c}, + {layer.k, layer.t, layer.r, layer.s, layer.c}, + cutlass::make_Coord(layer.stride_d, layer.stride_h, layer.stride_w)); + + Result result = profile_convolution(options); + result.print(std::cout, idx, options) 
<< std::endl; + } + + ++idx; + } + } + else { + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + Result result = profile_convolution(options); + + Result::print_header(std::cout, options) << std::endl; + result.print(std::cout, 1, options) << std::endl; + } + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu index fe756fbadd..87ed21c013 100644 --- a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu +++ b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -429,9 +429,13 @@ Result profile_convolution(Options const &options) { ElementInputB(-8), 0); - // Fill tensor C on host with zeros - cutlass::reference::host::TensorFill( - tensor_c.host_view()); + // Fill tensor C on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c.host_view(), + 1, + ElementOutput(7), + ElementOutput(-8), + 0); // Fill tensor D on host with zeros cutlass::reference::host::TensorFill( @@ -575,7 +579,7 @@ Result profile_convolution(Options const &options) { std::stringstream ss; - ss << "25_ampere_fprop_mainloop_fusion_" + ss << "25_ampere_fprop_mainloop_fusion" << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() << "_" << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() @@ -677,8 +681,8 @@ int main(int argc, char const **args) { cudaDeviceProp props; CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); - if (!(props.major == 8 && props.minor == 0)) { - std::cerr << "This test must run on SM80 A100.\n"; + if (!(props.major >= 8)) { + std::cerr << "This test must run on SM80 or above.\n"; notSupported = true; } diff --git a/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt b/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt index f836f6e08c..e96050c370 100644 --- a/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt +++ b/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu index 72d7284f6f..abb66b52d6 100644 --- a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu +++ b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -266,8 +266,8 @@ struct Options { /// Prints the usage statement. std::ostream & print_usage(std::ostream &out) const { - out << "26_ampere_fused_wgrad_batch_normalization example\n\n" - << " This example fuses scale+bias+relu from batch norm into Ampere's\n" + out << "26_ampere_wgrad_mainloop_fusion example\n\n" + << " This example fuses scale+bias+relu of the activation into Ampere's\n" << " Tensor Core operators on F16 data types to compute\n" << " backward convolution on tensors of layout NHWC.\n\n" << "Options:\n\n" @@ -289,8 +289,8 @@ struct Options { << " --tag= String to replicate across the first column in the results table\n"; out << "\n\nExamples:\n\n" - << "$ ./examples/26_ampere_fused_fprop_batch_normalization/26_ampere_fused_wgrad_batch_normalization --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n" - << "$ ./examples/26_ampere_fused_fprop_batch_normalization/26_ampere_fused_wgrad_batch_normalization --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n"; + << "$ ./examples/26_ampere_wgrad_mainloop_fusion/26_ampere_wgrad_mainloop_fusion --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n" + << "$ ./examples/26_ampere_wgrad_mainloop_fusion/26_ampere_wgrad_mainloop_fusion --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n"; return out; } @@ -427,9 +427,13 @@ Result profile_convolution(Options const &options) { ElementInputA(-4), 0); - // Fill tensor C on host with zeros - cutlass::reference::host::TensorFill( - tensor_c.host_view()); + // Fill tensor C on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c.host_view(), + 1, + ElementOutput(7), + ElementOutput(-8), + 0); // Fill tensor D on host with zeros cutlass::reference::host::TensorFill( diff --git a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu index 06559637fe..9e561cb6a2 100644 --- a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu +++ b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -36,7 +36,7 @@ implicitly to tf32 inside the GEMM kernel which means no change is needed to acc fp32 data by using NVIDIA Ampere architecture. We can use the tf32 mode of tensor core to emulate a fast accurate SGEMM kernel which is accelerated -using Ampere Tensor Cores (see include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h). +using Ampere Tensor Cores (see include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h). The trick is very simple a x b = (a_big + a_small) x (b_big + b_small) = a_big x b_big + a_big x b_small + a_small x b_big @@ -45,11 +45,11 @@ The trick is very simple a_small x b_small is discarded because they are too small. -This example demonstrates usage of this kernel, along with accuracy measurements w.r.t. actual FP32 +This example demonstrates usage of this kernel, along with accuracy measurements w.r.t. actual FP32 results (SGEMM using SIMT) and against FP64 results (DGEMM) -To enable this feature, the only change needs to make is to change the default OpMultiplyAdd to -OpMultiplyAddFastF32. +To enable this feature, the only change needs to make is to change the default OpMultiplyAdd to +OpMultiplyAddFastF32. Now, we have several different flavors of sgemm now in the profiler for Ampere. Here are the difference @@ -97,14 +97,14 @@ struct Result { double l2_norm_fp32_vs_fp64; // ctor - Result( + Result( int m, int n, int k, double runtime_ms, double gflops, double l2_norm_3xtf32_vs_fp64, double l2_norm_1xtf32_vs_fp64, - double l2_norm_fp32_vs_fp64) : + double l2_norm_fp32_vs_fp64) : m(m), n(n), k(k), - runtime_ms(runtime_ms), gflops(gflops), + runtime_ms(runtime_ms), gflops(gflops), l2_norm_3xtf32_vs_fp64(l2_norm_3xtf32_vs_fp64), l2_norm_1xtf32_vs_fp64(l2_norm_1xtf32_vs_fp64), l2_norm_fp32_vs_fp64(l2_norm_fp32_vs_fp64) {} @@ -147,7 +147,7 @@ struct Options { int iterations; int seed; bool benchmark; - + Options(): help(false), problem_size({3456, 4096, 4096}), @@ -190,7 +190,7 @@ struct Options { cmd.get_cmd_line_argument("alpha", alpha); cmd.get_cmd_line_argument("beta", beta); - + cmd.get_cmd_line_argument("iterations", iterations); cmd.get_cmd_line_argument("seed", seed); cmd.get_cmd_line_argument("rand_mode", rand_mode); @@ -227,9 +227,9 @@ struct Options { /// Compute performance in GFLOP/s double gflops(double runtime_s) const { - // Number of real-valued multiply-adds + // Number of real-valued multiply-adds int64_t fmas = problem_size.product(); - + // Two flops per multiply-add return 2.0 * double(fmas) / double(1.0e9) / runtime_s; } @@ -258,7 +258,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 32, 16>; // <- warp tile M = using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 16, N = 8, K = 8 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? 
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // This code section describes the epilogue part of the kernel using EpilogueOp = cutlass::epilogue::thread::LinearCombination< @@ -272,10 +272,10 @@ using EpilogueOp = cutlass::epilogue::thread::LinearCombination< // Number of pipelines you want to use constexpr int NumStages = 3; -// Alignment +// Alignment constexpr int Alignment = 4; -// +// // Gemm Operators (Gemm_3xTF32, Gemm_1xTF32, GEMM_F32, GEMM_F64) // @@ -296,7 +296,7 @@ using Gemm_3xTF32 = cutlass::gemm::device::Gemm< EpilogueOp, SwizzleThreadBlock, NumStages, - Alignment, + Alignment, Alignment, false, cutlass::arch::OpMultiplyAddFastF32>; @@ -318,7 +318,7 @@ using Gemm_1xTF32 = cutlass::gemm::device::Gemm< EpilogueOp, SwizzleThreadBlock, NumStages, - Alignment, + Alignment, Alignment, false, cutlass::arch::OpMultiplyAdd>; @@ -356,7 +356,7 @@ bool run(Options &options) { cutlass::HostTensor tensor_a_F32(problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor tensor_b_F32(problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor tensor_c_F32(problem_size.mn()); // <- Create matrix C with dimensions M x N - cutlass::HostTensor tensor_d_F32(problem_size.mn()); // <- Create matrix D with dimensions M x N + cutlass::HostTensor tensor_d_F32(problem_size.mn()); // <- Create matrix D with dimensions M x N if (options.rand_mode == "uniform") { const float min = -1; @@ -397,7 +397,7 @@ bool run(Options &options) { } cutlass::reference::host::TensorFill( tensor_d_F32.host_view()); // <- fill matrix D on host with zeros - + // Copy data from host to GPU tensor_a_F32.sync_device(); tensor_b_F32.sync_device(); @@ -411,7 +411,7 @@ bool run(Options &options) { cutlass::HostTensor tensor_a_F64(problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor tensor_b_F64(problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor tensor_c_F64(problem_size.mn()); // <- Create matrix C with dimensions M x N - + // Gemm output (D) for GEMM_F64 cutlass::HostTensor tensor_d_F64(problem_size.mn()); // <- Create matrix D with dimensions M x N // Gemm output (D) for GEMM_3xTF32 @@ -426,7 +426,7 @@ bool run(Options &options) { cutlass::reference::host::TensorCopy(tensor_d_F64.host_view(), tensor_d_F32.host_view()); cutlass::reference::host::TensorCopy(tensor_d_3xTF32.host_view(), tensor_d_F32.host_view()); cutlass::reference::host::TensorCopy(tensor_d_1xTF32.host_view(), tensor_d_F32.host_view()); - + // Copy data from host to GPU tensor_a_F64.sync_device(); tensor_b_F64.sync_device(); @@ -464,7 +464,7 @@ bool run(Options &options) { // Instantiate CUTLASS kernel depending on templates Gemm_3xTF32 gemm_op_3xTF32; - // Check the problem size is supported or not + // Check the problem size is supported or not cutlass::Status status_3xtf32 = gemm_op_3xTF32.can_implement(arguments_3xtf32); CUTLASS_CHECK(status_3xtf32); @@ -568,7 +568,7 @@ bool run(Options &options) { // Instantiate CUTLASS kernel depending on templates Gemm_1xTF32 gemm_op_1xtf32; - // Check the problem size is supported or not + // Check the problem size is supported or not cutlass::Status status_1xtf32 = gemm_op_1xtf32.can_implement(arguments_1xtf32); CUTLASS_CHECK(status_1xtf32); @@ -627,7 +627,7 @@ bool run(Options &options) { tensor_d_F32.sync_host(); //////////////////////////////////////////////////////////////////////////////// - /////// Compute l2 norms + /////// Compute l2 norms 
//////////////////////////////////////////////////////////////////////////////// // l2 norm 3xTF32 vs F64 @@ -664,7 +664,7 @@ bool run(Options &options) { std::cout << "GFLOPs: " << result.gflops << std::endl; std::cout << "Normalized L2 norm of" << std::endl; std::cout.precision(8); - std::cout << std::scientific + std::cout << std::scientific << " - 3xTF32 error with FP64 reference : " << result.l2_norm_3xtf32_vs_fp64 << std::endl << " - 1xTF32 error with FP64 reference : " << result.l2_norm_1xtf32_vs_fp64 << std::endl << " - FP32 error with FP64 reference : " << result.l2_norm_fp32_vs_fp64 << std::endl; @@ -673,11 +673,11 @@ bool run(Options &options) { } int main(int argc, const char **argv) { - + bool notSupported = false; // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available - // in CUDA 11.0. + // in CUDA 11.0. // // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ >= 11)) { @@ -690,7 +690,7 @@ int main(int argc, const char **argv) { cudaError_t error = cudaGetDeviceProperties(&props, 0); if (error != cudaSuccess) { std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return false; + return -1; } if (!((props.major * 10 + props.minor) >= 80)) { @@ -716,17 +716,17 @@ int main(int argc, const char **argv) { if (options.benchmark) { for (int k = 4; k <= 65536; k *= 2) { - + options.problem_size[2] = k; - + printf("Gemm problem size: %d x %d x %d\n", \ options.problem_size.m(), options.problem_size.n(), options.problem_size.k()); - + if (!options.valid()) { std::cerr << "Invalid problem." << std::endl; return -1; } - + result &= run(options); } } else { diff --git a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt index c551b1256f..5b38de6e91 100644 --- a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt +++ b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt index 04ac4bd183..50a7c9e619 100644 --- a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt +++ b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu index a197e2efce..d2a3b4c693 100644 --- a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu +++ b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -740,7 +740,7 @@ int main(int argc, char const **args) { cudaDeviceProp props; CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); - if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + if (!(props.major >= 8)) { std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." << std::endl; notSupported = true; diff --git a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm.cu b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu similarity index 97% rename from examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm.cu rename to examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu index 6f89d64fa7..0a995bf929 100644 --- a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm.cu +++ b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -34,7 +34,7 @@ difference is that this example uses 3xtf32 on complex gemm. To enable this feature, the only change needs to make is to change OpMultiplyAddComplex - to OpMultiplyAddComplexFastF32. + to OpMultiplyAddComplexFastF32. 
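The "big + small" trick that the comments in examples 27 and 29 above describe can be illustrated on the host. The following is a minimal sketch, not part of either example: the to_tf32 helper is hypothetical and truncates the low mantissa bits for illustration, whereas the tensor cores round to nearest.

#include <cstdint>
#include <cstring>

// Round a float to TF32 precision (10 explicit mantissa bits) by truncation.
inline float to_tf32(float x) {
  std::uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0xffffe000u;                 // drop the low 13 of 23 mantissa bits
  std::memcpy(&x, &bits, sizeof(x));
  return x;
}

// Approximate an FP32 product with three TF32 products.
inline float product_3xtf32(float a, float b) {
  float a_big = to_tf32(a), a_small = to_tf32(a - a_big);
  float b_big = to_tf32(b), b_small = to_tf32(b - b_big);
  // a_small x b_small is dropped, exactly as the comments above note.
  return a_big * b_big + a_big * b_small + a_small * b_big;
}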
*/ #include @@ -74,14 +74,14 @@ struct Result { double l2_norm_fp32_vs_fp64; // ctor - Result( + Result( int m, int n, int k, double runtime_ms, double gflops, double l2_norm_3xtf32_vs_fp64, double l2_norm_1xtf32_vs_fp64, - double l2_norm_fp32_vs_fp64) : + double l2_norm_fp32_vs_fp64) : m(m), n(n), k(k), - runtime_ms(runtime_ms), gflops(gflops), + runtime_ms(runtime_ms), gflops(gflops), l2_norm_3xtf32_vs_fp64(l2_norm_3xtf32_vs_fp64), l2_norm_1xtf32_vs_fp64(l2_norm_1xtf32_vs_fp64), l2_norm_fp32_vs_fp64(l2_norm_fp32_vs_fp64) {} @@ -124,7 +124,7 @@ struct Options { int iterations; int seed; bool benchmark; - + Options(): help(false), problem_size({3456, 4096, 4096}), @@ -153,7 +153,7 @@ struct Options { cmd.get_cmd_line_argument("alpha", alpha); cmd.get_cmd_line_argument("beta", beta); - + cmd.get_cmd_line_argument("iterations", iterations); cmd.get_cmd_line_argument("seed", seed); cmd.get_cmd_line_argument("rand_mode", rand_mode); @@ -181,7 +181,7 @@ struct Options { << " --benchmark If set (true), performance benchmarking on several layers and batch-size.\n\n"; out << "\n\nExamples:\n\n" - << "$ ./examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_ampere_3xtf32_fast_accurate_complex_gemm --m=1024 --n=512 \\\n" + << "$ ./examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm --m=1024 --n=512 \\\n" << " --alpha=2 --beta=0.707 \n\n"; return out; @@ -190,9 +190,9 @@ struct Options { /// Compute performance in GFLOP/s double gflops(double runtime_s) const { - // Number of real-valued multiply-adds + // Number of real-valued multiply-adds int64_t fmas = problem_size.product(); - + // Two flops per multiply-add return 2.0 * double(fmas) / double(1.0e9) / runtime_s; } @@ -221,7 +221,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<32, 32, 16>; // <- warp tile M = using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 16, N = 8, K = 8 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? 
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // This code section describes the epilogue part of the kernel using EpilogueOp = cutlass::epilogue::thread::LinearCombination< @@ -239,7 +239,7 @@ constexpr int NumStages = 3; constexpr cutlass::ComplexTransform TransformA = cutlass::ComplexTransform::kNone; constexpr cutlass::ComplexTransform TransformB = cutlass::ComplexTransform::kNone; -// +// // Gemm Operators (Gemm_3xTF32, Gemm_1xTF32, GEMM_F32, GEMM_F64) // @@ -260,7 +260,7 @@ using Gemm_3xTF32 = cutlass::gemm::device::GemmComplex< EpilogueOp, SwizzleThreadBlock, NumStages, - TransformA, + TransformA, TransformB, cutlass::arch::OpMultiplyAddComplexFastF32>; @@ -281,7 +281,7 @@ using Gemm_1xTF32 = cutlass::gemm::device::GemmComplex< EpilogueOp, SwizzleThreadBlock, NumStages, - TransformA, + TransformA, TransformB, cutlass::arch::OpMultiplyAddComplex>; @@ -296,7 +296,7 @@ bool run(Options &options) { cutlass::HostTensor, LayoutInputA> tensor_a_F32(problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor, LayoutInputB> tensor_b_F32(problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor, LayoutOutput> tensor_c_F32(problem_size.mn()); // <- Create matrix C with dimensions M x N - cutlass::HostTensor, LayoutOutput> tensor_d_F32(problem_size.mn()); // <- Create matrix D with dimensions M x N + cutlass::HostTensor, LayoutOutput> tensor_d_F32(problem_size.mn()); // <- Create matrix D with dimensions M x N if (options.rand_mode == "uniform") { const float min = -1; @@ -337,7 +337,7 @@ bool run(Options &options) { } cutlass::reference::host::TensorFill( tensor_d_F32.host_view()); // <- fill matrix D on host with zeros - + // Copy data from host to GPU tensor_a_F32.sync_device(); tensor_b_F32.sync_device(); @@ -351,7 +351,7 @@ bool run(Options &options) { cutlass::HostTensor, LayoutInputA> tensor_a_F64(problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor, LayoutInputB> tensor_b_F64(problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor, LayoutOutput> tensor_c_F64(problem_size.mn()); // <- Create matrix C with dimensions M x N - + // Gemm output (D) for GEMM_F64 cutlass::HostTensor, LayoutOutput> tensor_d_F64(problem_size.mn()); // <- Create matrix D with dimensions M x N // Gemm output (D) for GEMM_3xTF32 @@ -366,7 +366,7 @@ bool run(Options &options) { cutlass::reference::host::TensorCopy(tensor_d_F64.host_view(), tensor_d_F32.host_view()); cutlass::reference::host::TensorCopy(tensor_d_3xTF32.host_view(), tensor_d_F32.host_view()); cutlass::reference::host::TensorCopy(tensor_d_1xTF32.host_view(), tensor_d_F32.host_view()); - + // Copy data from host to GPU tensor_a_F64.sync_device(); tensor_b_F64.sync_device(); @@ -404,7 +404,7 @@ bool run(Options &options) { // Instantiate CUTLASS kernel depending on templates Gemm_3xTF32 gemm_op; - // Check the problem size is supported or not + // Check the problem size is supported or not cutlass::Status status_3xtf32 = gemm_op.can_implement(arguments_3xtf32); CUTLASS_CHECK(status_3xtf32); @@ -508,7 +508,7 @@ bool run(Options &options) { // Instantiate CUTLASS kernel depending on templates Gemm_1xTF32 gemm_op_1xtf32; - // Check the problem size is supported or not + // Check the problem size is supported or not cutlass::Status status_1xtf32 = gemm_op_1xtf32.can_implement(arguments_1xtf32); CUTLASS_CHECK(status_1xtf32); @@ -569,7 +569,7 @@ bool run(Options &options) { tensor_d_F32.sync_host(); 
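The "Normalized L2 norm" values these examples print are relative error metrics against the FP64 reference. A minimal host-side sketch of the assumed computation (intended to mirror what cutlass::reference::host::TensorRelativeErrorMetric reports: the norm of the difference divided by the norm of the reference):

#include <cmath>
#include <cstddef>
#include <vector>

// Sketch only: relative L2 error of a result x against a higher-precision reference.
double normalized_l2_error(std::vector<double> const &x, std::vector<double> const &ref) {
  double num = 0.0, den = 0.0;
  for (std::size_t i = 0; i < x.size() && i < ref.size(); ++i) {
    double d = x[i] - ref[i];
    num += d * d;
    den += ref[i] * ref[i];
  }
  return std::sqrt(num) / std::sqrt(den);
}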
//////////////////////////////////////////////////////////////////////////////// - /////// Compute l2 norms + /////// Compute l2 norms //////////////////////////////////////////////////////////////////////////////// // l2 norm 3xTF32 vs F64 @@ -606,7 +606,7 @@ bool run(Options &options) { std::cout << "GFLOPs: " << result.gflops << std::endl; std::cout << "Normalized L2 norm of" << std::endl; std::cout.precision(8); - std::cout << std::scientific + std::cout << std::scientific << " - 3xTF32 error with FP64 reference : " << result.l2_norm_3xtf32_vs_fp64 << std::endl << " - 1xTF32 error with FP64 reference : " << result.l2_norm_1xtf32_vs_fp64 << std::endl << " - FP32 error with FP64 reference : " << result.l2_norm_fp32_vs_fp64 << std::endl; @@ -615,11 +615,11 @@ bool run(Options &options) { } int main(int argc, const char **argv) { - + bool notSupported = false; // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available - // in CUDA 11.0. + // in CUDA 11.0. // // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ >= 11)) { @@ -632,7 +632,7 @@ int main(int argc, const char **argv) { cudaError_t error = cudaGetDeviceProperties(&props, 0); if (error != cudaSuccess) { std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return false; + return -1; } if (!((props.major * 10 + props.minor) >= 80)) { @@ -658,17 +658,17 @@ int main(int argc, const char **argv) { if (options.benchmark) { for (int k = 4; k <= 65536; k *= 2) { - + options.problem_size[2] = k; - + printf("Gemm problem size: %d x %d x %d\n", \ options.problem_size.m(), options.problem_size.n(), options.problem_size.k()); - + if (!options.valid()) { std::cerr << "Invalid problem." << std::endl; return -1; } - + result &= run(options); } } else { diff --git a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt index c7e896ba90..e406a7eda7 100644 --- a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt +++ b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -27,9 +27,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - +# Both filenames are shorter to avoid MAX_PATH issues on Windows. cutlass_example_add_executable( - 29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm - 29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm.cu + 29_3xtf32_complex_gemm + 29_3xtf32_complex_gemm.cu ) diff --git a/examples/30_wgrad_split_k/30_wgrad_split_k.cu b/examples/30_wgrad_split_k/30_wgrad_split_k.cu index 5016adf292..822a7a55f8 100644 --- a/examples/30_wgrad_split_k/30_wgrad_split_k.cu +++ b/examples/30_wgrad_split_k/30_wgrad_split_k.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -531,17 +531,17 @@ Result profile_convolution(Options const &options) { // Reduction input { reinterpret_cast (workspace.get()), - ReductionStrideIndex(tensor_c.stride()[ImplicitGemm::ImplicitGemmKernel::kTensorCStrideIdx]) + ReductionStrideIndex(tensor_c.stride()[ImplicitGemm::UnderlyingKernel::kTensorCStrideIdx]) }, // Destination { tensor_d.device_data(), - ReductionStrideIndex(tensor_d.stride()[ImplicitGemm::ImplicitGemmKernel::kTensorCStrideIdx]) + ReductionStrideIndex(tensor_d.stride()[ImplicitGemm::UnderlyingKernel::kTensorCStrideIdx]) }, // Source { tensor_c.device_data(), - ReductionStrideIndex(tensor_c.stride()[ImplicitGemm::ImplicitGemmKernel::kTensorCStrideIdx]) + ReductionStrideIndex(tensor_c.stride()[ImplicitGemm::UnderlyingKernel::kTensorCStrideIdx]) }, {options.alpha, options.beta} ); @@ -602,7 +602,7 @@ Result profile_convolution(Options const &options) { std::stringstream ss; - ss << "26_ampere_fused_wgrad_batch_normalization_" + ss << "30_wgrad_split_k_" << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() << "_" << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() @@ -703,7 +703,7 @@ int main(int argc, char const **args) { cudaDeviceProp props; CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); - if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + if (!(props.major >= 8)) { std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." << std::endl; notSupported = true; diff --git a/examples/30_wgrad_split_k/CMakeLists.txt b/examples/30_wgrad_split_k/CMakeLists.txt index 3fc5a8954f..98eda79126 100644 --- a/examples/30_wgrad_split_k/CMakeLists.txt +++ b/examples/30_wgrad_split_k/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/31_basic_syrk/CMakeLists.txt b/examples/31_basic_syrk/CMakeLists.txt index e6c077502e..8d5571d237 100644 --- a/examples/31_basic_syrk/CMakeLists.txt +++ b/examples/31_basic_syrk/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/31_basic_syrk/basic_syrk.cu b/examples/31_basic_syrk/basic_syrk.cu index 79b3ab6019..9f9cd93a35 100644 --- a/examples/31_basic_syrk/basic_syrk.cu +++ b/examples/31_basic_syrk/basic_syrk.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -34,7 +34,7 @@ matrix multiply kernel to verify its correctness. The CUTLASS Syrk template is instantiated in the function CutlassSsyrkNN. 
This is kernel computes - the symmetric rank-k update (SYRK) using double-precision doubleing-point arithmetic and assumes + the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes all matrices have column-major layout. The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices. @@ -113,10 +113,10 @@ cudaError_t CutlassSsyrkNN( >, cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, 5, // Stages - 1, // AligmentA + 1, // AlignmentA false, // SplitKSerail - cutlass::arch::OpMultiplyAdd, - cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, cutlass::BlasMode::kSymmetric >; @@ -149,7 +149,7 @@ cudaError_t CutlassSsyrkNN( // // Launch the CUTLASS SYRK kernel. // - + cutlass::Status status = syrk_operator(args); // diff --git a/examples/32_basic_trmm/CMakeLists.txt b/examples/32_basic_trmm/CMakeLists.txt index 0e1afff190..459dbe8f94 100644 --- a/examples/32_basic_trmm/CMakeLists.txt +++ b/examples/32_basic_trmm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/32_basic_trmm/basic_trmm.cu b/examples/32_basic_trmm/basic_trmm.cu index 988b7a6298..d2eda76a0c 100644 --- a/examples/32_basic_trmm/basic_trmm.cu +++ b/examples/32_basic_trmm/basic_trmm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -34,7 +34,7 @@ matrix multiply kernel to verify its correctness. The CUTLASS Trmm template is instantiated in the function CutlassStrmmNN. This is kernel computes - the triangular matrix product (TRMM) using double-precision doubleing-point arithmetic and assumes + the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes all matrices have column-major layout. The threadblock tile size is chosen as 64x64x16 which offers good performance for large matrices. diff --git a/examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt b/examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt index 858378da58..115040396e 100644 --- a/examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt +++ b/examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu b/examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu index 2e875a91c0..22cb3286eb 100644 --- a/examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu +++ b/examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -36,7 +36,7 @@ implicitly to tf32 inside the SYMM kernel which means no change is needed to acc F32 data by using NVIDIA Ampere architecture. We can use the tf32 mode of tensor core to emulate a fast accurate SYMM kernel which is accelerated -using Ampere Tensor Cores (see include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h). +using Ampere Tensor Cores (see include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h). The trick is very simple a x b = (a_big + a_small) x (b_big + b_small) = a_big x b_big + a_big x b_small + a_small x b_big @@ -45,11 +45,11 @@ The trick is very simple a_small x b_small is discarded because they are too small. -This example demonstrates usage of this kernel, along with accuracy measurements w.r.t. actual F32 +This example demonstrates usage of this kernel, along with accuracy measurements w.r.t. actual F32 results (SSYMM from cuBLAS) and against F64 results (DSYMM from CUTLASS) -To enable this feature, the only change needs to make is to change the default OpMultiplyAdd to -OpMultiplyAddFastF32. +To enable this feature, the only change needs to make is to change the default OpMultiplyAdd to +OpMultiplyAddFastF32. Now, we have two different flavors of SSYMM in the profiler for Ampere: @@ -95,7 +95,7 @@ struct Options { float beta; std::string rand_mode; int seed; - + Options(): help(false), problem_size({4096, 4096, 4096}), @@ -137,7 +137,7 @@ struct Options { cmd.get_cmd_line_argument("alpha", alpha); cmd.get_cmd_line_argument("beta", beta); - + cmd.get_cmd_line_argument("seed", seed); cmd.get_cmd_line_argument("rand_mode", rand_mode); @@ -193,7 +193,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 32, 16>; // <- warp tile M = using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 16, N = 8, K = 8 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? 
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // This code section describes the epilogue part of the kernel using EpilogueOp = cutlass::epilogue::thread::LinearCombination< @@ -207,10 +207,10 @@ using EpilogueOp = cutlass::epilogue::thread::LinearCombination< // Number of pipelines you want to use constexpr int NumStages = 3; -// Alignment +// Alignment constexpr int Alignment = 4; -// +// // CUTLASS Symm Operators (SSYM: Symm_3xTF32, Symm_1xTF32, DSYMM: Symm_F64) // @@ -233,7 +233,7 @@ using Symm_3xTF32 = cutlass::gemm::device::Symm< EpilogueOp, SwizzleThreadBlock, NumStages, - 1, // Symmetric matrix is always align 1 + 1, // Symmetric matrix is always align 1 Alignment, false, cutlass::arch::OpMultiplyAddFastF32>; @@ -257,7 +257,7 @@ using Symm_1xTF32 = cutlass::gemm::device::Symm< EpilogueOp, SwizzleThreadBlock, NumStages, - 1, // Symmetric matrix is always align 1 + 1, // Symmetric matrix is always align 1 Alignment, false, cutlass::arch::OpMultiplyAdd>; @@ -298,7 +298,7 @@ bool run(Options &options) { cutlass::HostTensor tensor_a_F32(problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor tensor_b_F32(problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor tensor_c_F32(problem_size.mn()); // <- Create matrix C with dimensions M x N - cutlass::HostTensor tensor_d_F32(problem_size.mn()); // <- Create matrix D with dimensions M x N + cutlass::HostTensor tensor_d_F32(problem_size.mn()); // <- Create matrix D with dimensions M x N if (options.rand_mode == "uniform") { const float min = -1; @@ -339,7 +339,7 @@ bool run(Options &options) { } cutlass::reference::host::TensorFill( tensor_d_F32.host_view()); // <- fill matrix D on host with zeros - + // Copy data from host to GPU tensor_a_F32.sync_device(); tensor_b_F32.sync_device(); @@ -353,7 +353,7 @@ bool run(Options &options) { cutlass::HostTensor tensor_a_F64(problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor tensor_b_F64(problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor tensor_c_F64(problem_size.mn()); // <- Create matrix C with dimensions M x N - + // Symm output (D) for SYMM_3xTF32 cutlass::HostTensor tensor_d_3xTF32(problem_size.mn()); // <- Create matrix D with dimensions M x N // Symm output (D) for SYMM_1xTF32 @@ -375,7 +375,7 @@ bool run(Options &options) { #if CUTLASS_ENABLE_CUBLAS cutlass::reference::host::TensorCopy(tensor_d_cublasF32.host_view(), tensor_d_F32.host_view()); #endif - + // Copy data from host to GPU tensor_a_F64.sync_device(); tensor_b_F64.sync_device(); @@ -430,7 +430,7 @@ bool run(Options &options) { // Instantiate CUTLASS kernel depending on templates Symm_3xTF32 symm_op_3xtf32; - // Check the problem size is supported or not + // Check the problem size is supported or not cutlass::Status status_3xtf32 = symm_op_3xtf32.can_implement(arguments_3xtf32); CUTLASS_CHECK(status_3xtf32); @@ -477,7 +477,7 @@ bool run(Options &options) { // Instantiate CUTLASS kernel depending on templates Symm_1xTF32 symm_op_1xtf32; - // Check the problem size is supported or not + // Check the problem size is supported or not cutlass::Status status_1xtf32 = symm_op_1xtf32.can_implement(arguments_1xtf32); CUTLASS_CHECK(status_1xtf32); @@ -524,7 +524,7 @@ bool run(Options &options) { // Instantiate CUTLASS kernel depending on templates Symm_F64 symm_op_f64; - // Check the problem size is supported or not + // Check the problem size is supported or not cutlass::Status 
status_f64 = symm_op_f64.can_implement(arguments_f64); CUTLASS_CHECK(status_f64); @@ -568,7 +568,7 @@ bool run(Options &options) { static_cast(&beta), static_cast(tensor_d_cublasF32.device_data()), int(tensor_d_cublasF32.layout().stride(0)) - ); + ); cudaDeviceSynchronize(); @@ -576,7 +576,7 @@ bool run(Options &options) { #endif //////////////////////////////////////////////////////////////////////////////// - /// 7. Compute l2 norms + /// 7. Compute l2 norms //////////////////////////////////////////////////////////////////////////////// #if CUTLASS_ENABLE_CUBLAS @@ -605,20 +605,20 @@ bool run(Options &options) { double l2_norm_3xtf32_vs_cublasf32 = cutlass::reference::host::TensorRelativeErrorMetric( tensor_d_3xTF32.host_view(), tensor_d_cublasF32.host_view()); #endif - + // l2 norm 3xTF32 vs 1xTF32 double l2_norm_3xtf32_vs_1xtf32 = cutlass::reference::host::TensorRelativeErrorMetric( tensor_d_3xTF32.host_view(), tensor_d_1xTF32.host_view()); /////////////////////////////////////////////////////////////////////////////// - // Print kernel info and L2 norms + // Print kernel info and L2 norms std::cout << "Problem Size: (" << problem_size.m() << "," << problem_size.n() << "," << problem_size.k() << ") " << "Alpha: " << alpha << "," << " Beta: " << beta << std::endl; std::cout << std::fixed; std::cout << "Normalized L2 norm of" << std::endl; std::cout.precision(8); - std::cout << std::scientific + std::cout << std::scientific #if CUTLASS_ENABLE_CUBLAS << " - cuBLAS F32 error with F64 reference : " << l2_norm_cublasf32_vs_f64 << std::endl #endif @@ -633,11 +633,11 @@ bool run(Options &options) { } int main(int argc, const char **argv) { - + bool notSupported = false; // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available - // in CUDA 11.0. + // in CUDA 11.0. // // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ >= 11)) { @@ -650,7 +650,7 @@ int main(int argc, const char **argv) { cudaError_t error = cudaGetDeviceProperties(&props, 0); if (error != cudaSuccess) { std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return false; + return -1; } if (!((props.major * 10 + props.minor) >= 80)) { diff --git a/examples/34_transposed_conv2d/34_transposed_conv2d.cu b/examples/34_transposed_conv2d/34_transposed_conv2d.cu index d9d878ad27..f3393c7ce5 100644 --- a/examples/34_transposed_conv2d/34_transposed_conv2d.cu +++ b/examples/34_transposed_conv2d/34_transposed_conv2d.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -251,7 +251,7 @@ struct Options { << " --tag= String to replicate across the first column in the results table\n"; out << "\n\nExamples:\n\n" - << "$ ./examples/31_transposed_conv2d/31_transposed_conv2d --n=8 --h=32 --w=32 --c=16 --k=32 --r=3 --s=3\n\n"; + << "$ ./examples/34_transposed_conv2d/34_transposed_conv2d --n=8 --h=32 --w=32 --c=16 --k=32 --r=3 --s=3\n\n"; return out; } @@ -603,7 +603,7 @@ int main(int argc, char const **args) { cudaDeviceProp props; CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); - if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + if (!(props.major >= 8)) { std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." << std::endl; notSupported = true; diff --git a/examples/34_transposed_conv2d/CMakeLists.txt b/examples/34_transposed_conv2d/CMakeLists.txt index 00a0dbce73..414b011ac3 100644 --- a/examples/34_transposed_conv2d/CMakeLists.txt +++ b/examples/34_transposed_conv2d/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/35_gemm_softmax/CMakeLists.txt b/examples/35_gemm_softmax/CMakeLists.txt index 51611290bd..b7ecd99fcc 100644 --- a/examples/35_gemm_softmax/CMakeLists.txt +++ b/examples/35_gemm_softmax/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/35_gemm_softmax/gemm_softmax.cu b/examples/35_gemm_softmax/gemm_softmax.cu index 0d18077ee4..731e37b4d9 100644 --- a/examples/35_gemm_softmax/gemm_softmax.cu +++ b/examples/35_gemm_softmax/gemm_softmax.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -42,19 +42,24 @@ #include "cutlass/arch/memory.h" #include "cutlass/arch/memory_sm75.h" #include "cutlass/gemm/device/gemm_complex.h" - +#include "cutlass/numeric_types.h" +#include "cutlass/numeric_size.h" #include "cutlass/util/command_line.h" #include "cutlass/util/host_tensor.h" #include "cutlass/util/reference/host/gemm_complex.h" +#include "cutlass/util/reference/device/gemm_complex.h" #include "cutlass/util/reference/host/tensor_reduce.h" #include "cutlass/util/reference/host/tensor_compare.h" #include "cutlass/util/reference/host/tensor_norm.h" #include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/device/tensor_fill.h" #include "cutlass/util/reference/host/tensor_fill.h" #include "cutlass/util/reference/host/error_metrics.h" #include "cutlass/util/tensor_view_io.h" +#include "cutlass/numeric_size.h" // cutlass::bits_to_bytes +#include "cutlass/layout/matrix.h" #include "cutlass/epilogue/thread/linear_combination.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -85,18 +90,18 @@ struct Options { float alpha; float beta; bool verification_enabled; - double tolerance; + float tolerance; Options(): help(false), problem_size({16, 24, 64}), - batch_count(1), // As a temporary limitation to the test bench, batch count must be 1. The kernels support arbitrary batching. + batch_count(16), iterations(20), seed(2022), alpha(1), - beta(), + beta(0), verification_enabled(true), - tolerance(0.01) + tolerance(1e-5f) { } bool valid() { @@ -116,6 +121,8 @@ struct Options { cmd.get_cmd_line_argument("n", problem_size.n()); cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch_count", batch_count); + cmd.get_cmd_line_argument("alpha", alpha); cmd.get_cmd_line_argument("beta", beta); @@ -135,6 +142,7 @@ struct Options { << " --m= GEMM M dimension\n" << " --n= GEMM N dimension\n" << " --k= GEMM K dimension\n" + << " --batch_count= Batch number\n" << " --alpha= Epilogue scalar alpha\n" << " --beta= Epilogue scalar beta\n\n" << " --seed= Random number seed (1*)\n\n" @@ -198,13 +206,28 @@ struct Testbed { using ElementA = cutlass::half_t; using ElementB = cutlass::half_t; using ElementC = cutlass::half_t; - using ElementD = cutlass::half_t; using ElementCompute = float; - using ElementSoftmax = cutlass::half_t; + using ElementD = ElementC; + using ElementSoftmax = ElementC; using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using OperatorClass = cutlass::arch::OpClassTensorOp; + using ArchTag = cutlass::arch::Sm80; + + // ApplyShape impacts the final Softmax performance a lot. + // Set ApplyShape::kColumn to be the next multiple of 32 number that is after + // (gemm_N / alignment). + // Set ApplyShape::kRow to max(1, 128 / ApplyShape::kColumn). 
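  // Worked illustration of the rule above (numbers chosen for illustration only):
  // with gemm_N = 8192 and an alignment of 8 elements, gemm_N / alignment = 1024,
  // which is already a multiple of 32, so ApplyShape::kColumn = 1024 and
  // ApplyShape::kRow = max(1, 128 / 1024) = 1 -- i.e. the MatrixShape<1, 1024> below.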
+ using ApplyShape = cutlass::MatrixShape<1, 1024>; + + static int const kStages = 3; + /// Linear scaling operator using EpilogueFunctorOp = cutlass::epilogue::thread::LinearCombination< ElementC, @@ -218,12 +241,22 @@ struct Testbed { ElementB, LayoutB, ElementC, ElementCompute, - EpilogueFunctorOp + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueFunctorOp, + kStages, + ApplyShape >; using ElementNorm = typename GemmSoftmax::ElementNorm; using ElementSum = typename GemmSoftmax::ElementSum; using LayoutC = typename GemmSoftmax::LayoutC; + using LayoutN = typename GemmSoftmax::LayoutN; + using LayoutS = typename GemmSoftmax::LayoutS; + using MatrixCoord = typename LayoutC::TensorCoord; // // Data members @@ -231,20 +264,42 @@ struct Testbed { Options const &options; - cutlass::HostTensor tensor_A; - cutlass::HostTensor tensor_B; - cutlass::HostTensor tensor_C; - cutlass::HostTensor tensor_D; - cutlass::HostTensor tensor_N; - cutlass::HostTensor tensor_S; - cutlass::HostTensor tensor_Softmax; - cutlass::HostTensor reference_D; cutlass::HostTensor reference_N; - cutlass::HostTensor reference_Softmax; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_Ref; + cutlass::DeviceAllocation block_Softmax; + cutlass::DeviceAllocation block_Norm; + cutlass::DeviceAllocation block_Sum; int block_num = (options.problem_size.n() + GemmSoftmax::ThreadblockShape::kN - 1) / GemmSoftmax::ThreadblockShape::kN; + cutlass::gemm::GemmCoord problem = options.problem_size; + + int64_t lda = LayoutA::packed({problem.m(), problem.k()}).stride(0); + int64_t ldb = LayoutB::packed({problem.k(), problem.n()}).stride(0); + int64_t ldc = LayoutC::packed({problem.m(), problem.n()}).stride(0); + + // fixed rowmajor for norm and sum + int64_t ldn = problem.m(); + int64_t lds = ldn; + + int64_t total_elements_A_per_batch = problem.m() * problem.k(); + int64_t total_elements_B_per_batch = problem.k() * problem.n(); + int64_t total_elements_C_per_batch = problem.m() * problem.n(); + int64_t total_elements_D_per_batch = problem.m() * problem.n(); + int64_t total_elements_partial_norm_per_batch = block_num * problem.m(); + + int64_t total_elements_A = total_elements_A_per_batch * options.batch_count; + int64_t total_elements_B = total_elements_B_per_batch * options.batch_count; + int64_t total_elements_C = total_elements_C_per_batch * options.batch_count; + int64_t total_elements_D = total_elements_D_per_batch * options.batch_count; + int64_t total_elements_partial_norm = total_elements_partial_norm_per_batch * options.batch_count; + // // Methods // @@ -254,20 +309,7 @@ struct Testbed { ): options(options_) { - - tensor_A.reset({options.problem_size.m(), options.problem_size.k()}); - tensor_B.reset({options.problem_size.k(), options.problem_size.n()}); - - tensor_C.reset({options.problem_size.m(), options.problem_size.n()}); - tensor_D.reset({options.problem_size.m(), options.problem_size.n()}); - - tensor_N.reset({block_num, options.problem_size.m()}); - tensor_S.reset({block_num, options.problem_size.m()}); - tensor_Softmax.reset({options.problem_size.m(), options.problem_size.n()}); - - reference_D.reset({options.problem_size.m(), options.problem_size.n()}, false); reference_N.reset({options.problem_size.m(), 1}, false); - reference_Softmax.reset({options.problem_size.m(), options.problem_size.n()}, false); } /// Run @@ -300,11 +342,6 @@ struct Testbed { 
return disposition; } - // - // Compute the reference - // - compute_reference(); - // // Verify // @@ -334,43 +371,38 @@ struct Testbed { /// Random initialization void initialize() { - cutlass::reference::host::TensorFillRandomUniform( - tensor_A.host_view(), - options.seed, - ElementD(5), - ElementD(-5), - 0 - ); + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + block_Softmax.reset(total_elements_D); + block_Ref.reset(total_elements_D_per_batch); + block_Norm.reset(total_elements_partial_norm); + block_Sum.reset(total_elements_partial_norm); - cutlass::reference::host::TensorFillRandomUniform( - tensor_B.host_view(), - options.seed + 19, - ElementD(5), - ElementD(-5), - 0 - ); + cutlass::reference::device::BlockFillRandomUniform( + block_A.get(), total_elements_A, options.seed, ElementA(5), ElementA(-5), 0); - cutlass::reference::host::TensorFill( - reference_D.host_view(), - ElementD() - ); + cutlass::reference::device::BlockFillRandomUniform( + block_B.get(), total_elements_B, options.seed + 1, ElementB(5), ElementB(-5), 0); + + cutlass::reference::device::BlockFillRandomUniform( + block_C.get(), total_elements_C, options.seed + 2, ElementC(5), ElementC(-5), 0); + + cutlass::reference::device::BlockFillRandomUniform( + block_D.get(), total_elements_D, options.seed + 3, ElementD(5), ElementD(-5), 0); + + cutlass::reference::device::BlockFillRandomUniform( + block_Ref.get(), total_elements_D_per_batch, options.seed + 3, ElementD(5), ElementD(-5), 0); + + cutlass::reference::device::BlockFillRandomUniform( + block_Softmax.get(), total_elements_D, options.seed + 3, ElementSoftmax(5), ElementSoftmax(-5), 0); cutlass::reference::host::TensorFill( reference_N.host_view(), ElementNorm() ); - cutlass::reference::host::TensorFill( - reference_Softmax.host_view(), - ElementSoftmax() - ); - - tensor_A.sync_device(); - tensor_B.sync_device(); - tensor_D.sync_device(); - tensor_N.sync_device(); - tensor_S.sync_device(); - tensor_Softmax.sync_device(); } cutlass::Status execute_device_kernel() { @@ -384,17 +416,24 @@ struct Testbed { GemmSoftmax::Arguments args( options.problem_size, options.batch_count, - tensor_A.device_ref(), - tensor_B.device_ref(), - tensor_C.device_ref(), - tensor_D.device_ref(), + {block_A.get(), lda}, + {block_B.get(), ldb}, + {block_C.get(), ldc}, + {block_D.get(), ldc}, { ElementCompute(options.alpha), ElementCompute(options.beta) }, - tensor_N.device_ref(), - tensor_S.device_ref(), - tensor_Softmax.device_ref() + {block_Norm.get(), ldn}, + {block_Sum.get(), lds}, + {block_Softmax.get(), ldc}, + total_elements_A_per_batch, + total_elements_B_per_batch, + total_elements_C_per_batch, + total_elements_D_per_batch, + total_elements_partial_norm_per_batch, + total_elements_partial_norm_per_batch, + total_elements_D_per_batch ); // @@ -415,68 +454,21 @@ struct Testbed { return status; } - /// Reference calculation - void compute_reference() { - - // Compute GEMM - - cutlass::reference::host::GemmComplex( - options.problem_size, - options.alpha, - tensor_A.host_ref(), - cutlass::ComplexTransform::kNone, - tensor_B.host_ref(), - cutlass::ComplexTransform::kNone, - options.beta, - tensor_C.host_ref(), - reference_D.host_ref(), - double() - ); - - // Compute the norm - for (int m = 0; m < options.problem_size.m(); ++m) { - reference_N.at({m, 0}) = reference_D.at({m, 0}); - for (int n = 1; n < options.problem_size.n(); ++n) { - reference_N.at({m, 0}) = std::max(reference_N.at({m, 0}), 
ElementNorm(reference_D.at({m, n}))); - } - } - - // Compute softmax - for (int m = 0; m < options.problem_size.m(); ++m) { - - float sum = float(); - - for (int n = 0; n < options.problem_size.n(); ++n) { - sum += std::exp( float(reference_D.at({m, n})) - float(reference_N.at({m, 0})) ); - } - - float inv_sum = float(1.0f / sum); - - for (int n = 0; n < options.problem_size.n(); ++n) { - - reference_Softmax.at({m, n}) = ElementSoftmax( - std::exp( float(reference_D.at({m, n})) - float(reference_N.at({m, 0})) ) * inv_sum - ); - } - } - } - - /// Emits all tensor values - void emit_results() { - std::cout << "D = \n" << tensor_D.host_view() << "\n\n"; - std::cout << "N = \n" << tensor_N.host_view() << "\n\n"; - std::cout << "Softmax = \n" << tensor_Softmax.host_view() << "\n\n"; - std::cout << "Reference N = \n" << reference_N.host_view() << "\n\n"; - std::cout << "Reference D = \n" << reference_D.host_view() << "\n\n"; - std::cout << "Reference Softmax = \n" << reference_Softmax.host_view() << "\n\n"; - } - - bool verify_tensor_N(cutlass::HostTensor tensor_N, \ - cutlass::HostTensor reference_N) { - - for (int m = 0; m < options.problem_size.m(); ++m) { - float diff = (float)(tensor_N.at({0, m}) - reference_N.at({m, 0})); - if (fabs(diff) > options.tolerance) { + template + bool verify_tensor(std::vector vector_Input, \ + std::vector vector_Input_Ref) { + + auto size = int64_t((vector_Input.size() < vector_Input_Ref.size()) ? vector_Input.size() : vector_Input_Ref.size()); + float abs_tol = options.tolerance; + float rel_tol = options.tolerance; + + for (int64_t i = 0; i < size; ++i) { + float diff = (float)(vector_Input.at(i) - vector_Input_Ref.at(i)); + float abs_diff = fabs(diff); + float abs_ref = fabs((float)vector_Input_Ref.at(i)); + float relative_diff = abs_ref > abs_tol ? abs_diff / abs_ref : 0; + if ( (isnan(abs_diff) || isinf(abs_diff)) || (abs_diff > rel_tol && relative_diff > rel_tol)) { + printf("diff = %f, {%f, %f}.\n", abs_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i))); return false; } @@ -488,80 +480,112 @@ struct Testbed { /// Verifies the reference matches bool verify() { - tensor_D.sync_host(); - tensor_N.sync_host(); - tensor_Softmax.sync_host(); - - double const kThreshold = options.tolerance; - - // Verification checks - set any of these to 'true' to override the verification checks. 
- bool verified_D = false; - bool verified_N = false; - bool verified_Softmax = false; + LayoutA layout_A(lda); + LayoutB layout_B(ldb); + LayoutC layout_C(ldc); + LayoutN Layout_N(ldn); + LayoutS Layout_S(lds); + + MatrixCoord extent_A{problem.m(), problem.k()}; + MatrixCoord extent_B{problem.k(), problem.n()}; + MatrixCoord extent_C{problem.m(), problem.n()}; + + for (int batch_idx = 0; batch_idx < options.batch_count; batch_idx++) { + + cutlass::TensorView view_A(block_A.get() + total_elements_A_per_batch * batch_idx, layout_A, extent_A); + cutlass::TensorView view_B(block_B.get() + total_elements_B_per_batch * batch_idx, layout_B, extent_B); + cutlass::TensorView view_C(block_C.get() + total_elements_C_per_batch * batch_idx, layout_C, extent_C); + cutlass::TensorView view_Ref_device(block_Ref.get(), layout_C, extent_C); + + cutlass::reference::device::GemmComplex< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, ElementCompute + >( + problem, + options.alpha, + view_A, + cutlass::ComplexTransform::kNone, + view_B, + cutlass::ComplexTransform::kNone, + options.beta, + view_C, + view_Ref_device, + ElementCompute(0) + ); - // Verify softmax output - if (!verified_D) { + // Copy reference results to host memory for verification + std::vector matrix_D_Ref(layout_C.capacity(extent_C)); + cutlass::device_memory::copy_to_host(matrix_D_Ref.data(), block_Ref.get(), matrix_D_Ref.size()); + cutlass::TensorView view_Ref(matrix_D_Ref.data(), layout_C, extent_C); - double norm_diff = cutlass::reference::host::TensorNormDiff( - tensor_D.host_view(), - reference_D.host_view()); + std::vector matrix_Softmax_Ref(layout_C.capacity(extent_C)); + cutlass::TensorView view_Softmax_Ref(matrix_Softmax_Ref.data(), layout_C, extent_C); - double norm_reference = cutlass::reference::host::TensorNorm( - reference_D.host_view()); + // Copy computed results to host memory + std::vector matrix_D(layout_C.capacity(extent_C)); + cutlass::device_memory::copy_to_host(matrix_D.data(), block_D.get() + total_elements_D_per_batch * batch_idx, matrix_D.size()); - double rel_error = norm_diff / norm_reference; + std::vector matrix_Softmax(layout_C.capacity(extent_C)); + cutlass::device_memory::copy_to_host(matrix_Softmax.data(), block_Softmax.get() + total_elements_D_per_batch * batch_idx, matrix_Softmax.size()); - if (rel_error > kThreshold) { - std::cerr << "\n\nTensor D Relative error: " << rel_error << std::endl; - } - else { - verified_D = true; + // Compute the norm + for (int m = 0; m < options.problem_size.m(); ++m) { + reference_N.at({m, 0}) = view_Ref.ref().at({m, 0}); + for (int n = 1; n < options.problem_size.n(); ++n) { + reference_N.at({m, 0}) = std::max(reference_N.at({m, 0}), ElementNorm(view_Ref.ref().at({m, n}))); + } } - } - if (!verified_N) { - verified_N = verify_tensor_N(tensor_N, reference_N); - } + // Compute softmax + for (int m = 0; m < options.problem_size.m(); ++m) { - if (!verified_Softmax) { + float sum = float(); - double norm_diff = cutlass::reference::host::TensorNormDiff( - tensor_Softmax.host_view(), - reference_Softmax.host_view()); + for (int n = 0; n < options.problem_size.n(); ++n) { + sum += std::exp( float(view_Ref.ref().at({m, n})) - float(reference_N.at({m, 0})) ); + } - double norm_reference = cutlass::reference::host::TensorNorm( - reference_Softmax.host_view()); + float inv_sum = float(1.0f / sum); - double rel_error = norm_diff / norm_reference; + for (int n = 0; n < options.problem_size.n(); ++n) { - if (rel_error > kThreshold) { - std::cerr << 
"\n\nSoftmax Relative error: " << rel_error << std::endl; + view_Softmax_Ref.ref().at({m, n}) = ElementSoftmax( + std::exp( float(view_Ref.ref().at({m, n})) - float(reference_N.at({m, 0})) ) * inv_sum + ); + } } - else { - verified_Softmax = true; - } - } - if (!verified_D || !verified_N || !verified_Softmax) { + // Verification checks - set any of these to 'true' to override the verification checks. + bool verified_D = false; + bool verified_Softmax = false; - std::cerr << "Verification check failed for tensor Softmax" << std::endl; - - emit_results(); - - // Summarize which checks failed + // Verify softmax output if (!verified_D) { - std::cerr << "Verification of D tensor failed\n"; + verified_D = verify_tensor(matrix_D, matrix_D_Ref); } - if (!verified_N) { - std::cerr << "Verification of N tensor failed\n"; + if (!verified_Softmax) { + verified_Softmax = verify_tensor(matrix_Softmax, matrix_Softmax_Ref); } - if (!verified_Softmax) { - std::cerr << "Verification of Softmax tensor failed\n"; + if (!verified_D || !verified_Softmax) { + + std::cerr << "Verification check failed for tensor Softmax at batch " << batch_idx << "\n"; + + // Summarize which checks failed + if (!verified_D) { + std::cerr << "Verification of D tensor failed\n"; + } + + if (!verified_Softmax) { + std::cerr << "Verification of Softmax tensor failed\n"; + } + + return false; } - return false; } return true; @@ -635,16 +659,21 @@ struct Testbed { } int64_t flops = int64_t(options.problem_size.m()) * options.problem_size.n() * options.problem_size.k() * 2; - int64_t bytes = (sizeof(ElementD) * 2 + sizeof(ElementSoftmax)) * options.problem_size.m() * options.problem_size.n(); + int64_t bytes = cutlass::bits_to_bytes( + (cutlass::sizeof_bits::value * 2 + cutlass::sizeof_bits::value) * + options.problem_size.m() * options.problem_size.n()); + + double gflops_per_second = double(flops) * kIterations * options.batch_count / double(elapsed_ms / 1000.0f) / double(1.0e9); + double gbytes_per_second = double(bytes) * kIterations * options.batch_count / double(elapsed_ms / 1000.0f) / double(1 << 30); - double gflops_per_second = double(flops) * kIterations / double(elapsed_ms / 1000.0f) / double(1.0e9); - double gbytes_per_second = double(bytes) * kIterations / double(elapsed_ms / 1000.0f) / double(1 << 30); + double elapsed_ms_per_iter = double(elapsed_ms) / kIterations; std::cout << " Problem: " << options.problem_size.m() << "-by-" << options.problem_size.n() << "-by-" << options.problem_size.k() + << ", batch size: " << options.batch_count << std::endl; - std::cout << " Runtime: " << elapsed_ms << " ms\n" << std::endl; + std::cout << " Runtime: " << elapsed_ms_per_iter << " ms\n" << std::endl; std::cout << " GFLOPs: " << gflops_per_second << " GFLOPs" << std::endl; std::cout << "Memory bandwidth: " << gbytes_per_second << " GiB/s" << std::endl; @@ -692,6 +721,4 @@ int main(int argc, const char **argv) { return (disposition == Disposition::kPassed ? 0 : -1); } - ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h b/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h index 814de5ae7f..43208150d6 100644 --- a/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h +++ b/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -29,7 +29,8 @@ * **************************************************************************************************/ /*! \file - \brief GEMM kernel to support the 'epilogue visitor' model for fusion. + \brief GEMM kernel to support the epilogue visitor model + for customized softmax partial reduction epilogue fusion. This source file will likely be moved to `include/cutlass/gemm/kernel/` in the future once its usage has been stabilized. For now, it is included in this example to demonstrate @@ -78,6 +79,7 @@ struct GemmWithEpilogueVisitor { using ElementC = typename EpilogueVisitor::ElementOutput; using LayoutC = typename Epilogue::Layout; + using TensorRefC = TensorRef; static ComplexTransform const kTransformA = Mma::kTransformA; static ComplexTransform const kTransformB = Mma::kTransformB; @@ -89,6 +91,9 @@ struct GemmWithEpilogueVisitor { using InstructionShape = typename Mma::Policy::Operator::InstructionShape; using ArchTag = typename Mma::ArchTag; + using ElementNorm = typename EpilogueVisitor::ElementNorm; + using ElementSum = typename EpilogueVisitor::ElementSum; + static int const kStages = Mma::kStages; static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; @@ -121,6 +126,11 @@ struct GemmWithEpilogueVisitor { TensorRefA ref_A; TensorRefB ref_B; + TensorRefC ref_C; + TensorRefC ref_D; + + ElementNorm *ptr_Max; + ElementSum *ptr_Sum; int64_t batch_stride_A; int64_t batch_stride_B; @@ -144,6 +154,10 @@ struct GemmWithEpilogueVisitor { int batch_count_, TensorRefA ref_A_, TensorRefB ref_B_, + TensorRefC ref_C_, + TensorRefC ref_D_, + ElementNorm *ptr_Max_, + ElementSum *ptr_Sum_, int64_t batch_stride_A_, int64_t batch_stride_B_, typename EpilogueVisitor::Arguments epilogue_visitor_ @@ -153,6 +167,10 @@ struct GemmWithEpilogueVisitor { batch_count(batch_count_), ref_A(ref_A_), ref_B(ref_B_), + ref_C(ref_C_), + ref_D(ref_D_), + ptr_Max(ptr_Max_), + ptr_Sum(ptr_Sum_), batch_stride_A(batch_stride_A_), batch_stride_B(batch_stride_B_), epilogue_visitor(epilogue_visitor_) @@ -174,6 +192,8 @@ struct GemmWithEpilogueVisitor { typename Mma::IteratorA::Params params_A; typename Mma::IteratorB::Params params_B; + typename EpilogueVisitor::OutputTileIterator::Params params_C; + typename EpilogueVisitor::OutputTileIterator::Params params_D; GemmUniversalMode mode; int batch_count; @@ -181,6 +201,11 @@ struct GemmWithEpilogueVisitor { void * ptr_A; void * ptr_B; + ElementC * ptr_C; + ElementC * ptr_D; + + ElementNorm * ptr_Max; + ElementSum * ptr_Sum; int64_t batch_stride_A; int64_t batch_stride_B; @@ -196,11 +221,17 @@ struct GemmWithEpilogueVisitor { swizzle_log_tile(0), params_A(0), params_B(0), + params_C(0), + params_D(0), batch_count(0), gemm_k_size(0), mode(cutlass::gemm::GemmUniversalMode::kGemm), ptr_A(nullptr), ptr_B(nullptr), + ptr_C(nullptr), + ptr_D(nullptr), + ptr_Max(nullptr), + ptr_Sum(nullptr), batch_stride_A(0), batch_stride_B(0) { } @@ -213,11 +244,17 @@ struct GemmWithEpilogueVisitor { swizzle_log_tile(0), params_A(args.ref_A.layout()), params_B(args.ref_B.layout()), + params_C(args.ref_C.layout()), + params_D(args.ref_D.layout()), mode(args.mode), batch_count(args.batch_count), gemm_k_size(args.problem_size.k()), ptr_A(args.ref_A.data()), ptr_B(args.ref_B.data()), + 
ptr_C(args.ref_C.data()), + ptr_D(args.ref_D.data()), + ptr_Max(args.ptr_Max), + ptr_Sum(args.ptr_Sum), batch_stride_A(args.batch_stride_A), batch_stride_B(args.batch_stride_B), epilogue_visitor(args.epilogue_visitor) @@ -330,12 +367,6 @@ struct GemmWithEpilogueVisitor { return can_implement(args.problem_size); } - static size_t get_extra_workspace_size(Arguments const &args, - cutlass::gemm::GemmCoord const &grid_tiled_shape) { - - return 0; - } - #define SPLIT_K_ENABLED 1 /// Executes one GEMM @@ -467,7 +498,14 @@ struct GemmWithEpilogueVisitor { thread_idx, warp_idx, lane_idx, - threadblock_offset); + params.params_C, + params.params_D, + params.ptr_C, + params.ptr_D, + params.ptr_Max, + params.ptr_Sum, + threadblock_offset, + blockIdx.y *params.problem_size.m() ); if (params.mode == GemmUniversalMode::kGemm) { // Indicate which position in a serial reduction the output operator is currently updating diff --git a/examples/35_gemm_softmax/gemm_with_softmax.h b/examples/35_gemm_softmax/gemm_with_softmax.h index 213f8c5a44..748905d9f8 100644 --- a/examples/35_gemm_softmax/gemm_with_softmax.h +++ b/examples/35_gemm_softmax/gemm_with_softmax.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -49,10 +49,12 @@ #include "cutlass/gemm/kernel/default_gemm.h" #include "cutlass/gemm/kernel/default_gemm_complex.h" #include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h" +#include "cutlass/epilogue/threadblock/epilogue_with_visitor.h" +#include "cutlass/reduction/kernel/reduce_softmax_final.h" ///////////////////////////////////////////////////////////////////////////////////////////////// -#include "epilogue_with_visitor.h" #include "gemm_with_epilogue_visitor.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -77,7 +79,7 @@ template < typename ElementSoft_, typename ElementSoftmaxCompute_, int Alignment, - typename Shape_ = MatrixShape<4, 16> + typename ApplyShape_ = MatrixShape<1, 1024> > class ApplySoftmax { public: @@ -89,7 +91,7 @@ class ApplySoftmax { using ElementSoftmaxCompute = ElementSoftmaxCompute_; static int const kAlignment = Alignment; - using Shape = Shape_; + using ApplyShape = ApplyShape_; using Layout = cutlass::layout::RowMajor; @@ -200,7 +202,7 @@ class ApplySoftmax { using AccessTypeD = AlignedArray; int block_batch = blockIdx.z; - int block_m = blockIdx.x * Shape::kRow; + int block_m = blockIdx.x * ApplyShape::kRow; int block_n = 0; int thread_m = threadIdx.y; @@ -209,6 +211,9 @@ class ApplySoftmax { int idx_m = block_m + thread_m; int idx_n = block_n + thread_n; + int batch_offset_norm = block_batch * params.args.batch_stride_N; + int batch_offset_sum = block_batch * params.args.batch_stride_S; + // Kill off thread if it is outside the row boundary if (params.args.extent.row() <= idx_m) { return; @@ -251,8 +256,8 @@ class ApplySoftmax { params.args.batch_stride_Soft * block_batch + params.args.ref_Soft.layout()({idx_m, idx_n})); - ElementSum inv_sum = (params.args.ref_S.data())[block_m]; - ElementNorm norm = (params.args.ref_N.data())[block_m]; + ElementSum inv_sum = (params.args.ref_S.data())[idx_m + 
batch_offset_sum]; + ElementNorm norm = (params.args.ref_N.data())[idx_m + batch_offset_norm]; // // Loop @@ -261,10 +266,9 @@ class ApplySoftmax { for ( int idx = 0; idx < params.args.extent.column(); - idx += Shape::kColumn * kAlignment) { + idx += ApplyShape::kColumn * kAlignment) { if (idx_n < params.args.extent.column()) { - AccessTypeD fetch; arch::global_load(fetch, access_d, true); @@ -274,561 +278,10 @@ class ApplySoftmax { arch::global_store(soft, access_soft, true); } - access_d += Shape::kColumn; - access_soft += Shape::kColumn; - idx_n += Shape::kColumn * kAlignment; - } - } -}; - -template < - typename ElementNorm_, - typename ElementSum_, - typename ElementSoftmaxCompute_, - typename ThreadblockShape_ -> -class ApplyFinalReduction { -public: - - using ElementNorm = ElementNorm_; - using ElementSum = ElementSum_; - using ElementSoftmaxCompute = ElementSoftmaxCompute_; - using ThreadblockShape = ThreadblockShape_; - - using Layout = cutlass::layout::RowMajor; - - using TensorRefN = TensorRef; - using TensorRefSum = TensorRef; - - // - // Arguments - // - - struct Arguments { - - MatrixCoord extent; ///< Extent of D and Softmax matrices - int batch_count; ///< Batch count - TensorRefN ref_N; ///< Norm tensor (input / output) - TensorRefSum ref_Sum; ///< Sum tensor (input / output) - int64_t batch_stride_N; ///< Batch stride for N tensor - int64_t batch_stride_Sum; ///< Batch stride for softmax tensor - - // - // Methods - // - Arguments(): - batch_count(1), - batch_stride_N(0), - batch_stride_Sum(0) - { } - - Arguments( - MatrixCoord extent_, ///< Extent of D and Softmax matrices - int batch_count_, ///< Batch count - TensorRefN ref_N_, ///< Output parameter for N - TensorRefSum ref_Sum_ , ///< Sum - int64_t batch_stride_N_ = 0, - int64_t batch_stride_Sum_ = 0 - ): - extent(extent_), - batch_count(batch_count_), - ref_N(ref_N_), - ref_Sum(ref_Sum_), - batch_stride_N(batch_stride_N_), - batch_stride_Sum(batch_stride_Sum_) - { - - } - }; - - struct SharedStorage { - - - }; - - // - // Params struct - // - - struct Params { - Arguments args; - - // - // Methods - // - Params() { } - - Params(Arguments const &args_): args(args_) { } - }; - -private: - -public: - - CUTLASS_DEVICE - ApplyFinalReduction() { } - - CUTLASS_DEVICE - void operator()(Params const ¶ms, SharedStorage &shared_storage) { - - apply(params, shared_storage); - } - -private: - - /// Partial reduction - CUTLASS_DEVICE - void apply(Params const ¶ms, SharedStorage &shared_storage) { - - int threadblock_num = (params.args.extent.column() + ThreadblockShape::kN - 1) / ThreadblockShape::kN; - - int block_batch = blockIdx.z; - - int block_n = blockIdx.x * blockDim.x; - - int thread_n = threadIdx.x; - - int idx_n = block_n + thread_n; - - if (idx_n >= params.args.extent.row()) { - return; - } - - - using ConvertSumOutput = cutlass::NumericConverter; - using ConvertNormOutput = cutlass::NumericConverter; - - using ConvertSum = cutlass::NumericConverter; - using ConvertNorm = cutlass::NumericConverter; - - ConvertSum convert_sum; - ConvertNorm convert_norm; - - ConvertSumOutput convert_sum_output; - ConvertNormOutput convert_norm_output; - - ElementNorm *access_n = params.args.ref_N.data() + params.args.batch_stride_N * block_batch + idx_n; - ElementSum *access_s = params.args.ref_Sum.data() + params.args.batch_stride_Sum * block_batch + idx_n; - - ElementNorm *access_n_bak = access_n; - ElementSum *access_s_bak = access_s; - - uint32_t float_max_bits = 0xff7fffff; - float min_float = reinterpret_cast(float_max_bits); - - 
ElementSoftmaxCompute max_val = ElementSoftmaxCompute(min_float); - ElementSoftmaxCompute sum_val = ElementSoftmaxCompute(0); - ElementNorm fetch_n; - ElementSum fetch_s; - - CUTLASS_PRAGMA_UNROLL - for (int idx_m = 0; idx_m < threadblock_num; idx_m++) { - arch::global_load(fetch_n, access_n, true); - max_val = fast_max(max_val, convert_norm(fetch_n)); - access_n += params.args.extent.row(); - } - - access_n = access_n_bak; - - CUTLASS_PRAGMA_UNROLL - for (int idx_m = 0; idx_m < threadblock_num; idx_m++) { - arch::global_load(fetch_n, access_n, true); - arch::global_load(fetch_s, access_s, true); - sum_val += convert_sum(fetch_s) * fast_exp(convert_norm(fetch_n) - max_val); - access_n += params.args.extent.row(); - access_s += params.args.extent.row(); - } - - ElementSoftmaxCompute inv_sum = cutlass::constants::one() / sum_val; - - access_n = access_n_bak; - access_s = access_s_bak; - - access_n[0] = convert_norm_output(max_val); - access_s[0] = convert_sum_output(inv_sum); - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template < - typename ThreadblockShape_, - int ThreadCount, - typename OutputTileIterator_, - typename ElementAccumulator_, - typename ElementNorm_, - typename ElementSum_, - typename ElementSoftmaxCompute_, - typename ElementwiseFunctor_ -> -class EpilogueVisitorBiasMax { -public: - - using ThreadblockShape = ThreadblockShape_; - static int const kThreadCount = ThreadCount; - - using OutputTileIterator = OutputTileIterator_; - using ElementwiseFunctor = ElementwiseFunctor_; - - static int const kIterations = OutputTileIterator::kIterations; - static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess; - - using ElementOutput = typename OutputTileIterator::Element; - using LayoutOutput = cutlass::layout::RowMajor; - using ElementAccumulator = ElementAccumulator_; - - using ElementNorm = ElementNorm_; - using ElementSum = ElementSum_; - using ElementSoftmaxCompute = ElementSoftmaxCompute_; - - using AccumulatorFragment = Array; - using SoftmaxFragment = Array; - using OutputVector = Array; - using TensorRefD = TensorRef; - - /// Argument structure - struct Arguments { - - typename ElementwiseFunctor::Params elementwise; - TensorRefD ref_C; - TensorRefD ref_D; - ElementNorm *ptr_Max; - ElementSum *ptr_Sum; - int64_t batch_stride_C; - int64_t batch_stride_D; - int64_t batch_stride_Max; - int64_t batch_stride_Sum; - - // - // Methods - // - Arguments(): - ptr_Max(nullptr), - ptr_Sum(nullptr), - batch_stride_C(0), - batch_stride_D(0), - batch_stride_Max(0), - batch_stride_Sum(0) - { - - } - - Arguments( - typename ElementwiseFunctor::Params elementwise_, - TensorRefD ref_C_, - TensorRefD ref_D_, - ElementNorm *ptr_Max_, - ElementSum *ptr_Sum_, - int64_t batch_stride_C_, - int64_t batch_stride_D_, - int64_t batch_stride_Max_, - int64_t batch_stride_Sum_ - ): - elementwise(elementwise_), - ref_C(ref_C_), - ref_D(ref_D_), - ptr_Max(ptr_Max_), - ptr_Sum(ptr_Sum_), - batch_stride_C(batch_stride_C_), - batch_stride_D(batch_stride_D_), - batch_stride_Max(batch_stride_Max_), - batch_stride_Sum(batch_stride_Sum_) - { - - } - }; - - struct Params { - - typename ElementwiseFunctor::Params elementwise; - typename OutputTileIterator::Params params_C; - typename OutputTileIterator::Params params_D; - typename OutputTileIterator::Element *ptr_C; - typename OutputTileIterator::Element *ptr_D; - ElementNorm *ptr_Max; - ElementSum *ptr_Sum; - int64_t batch_stride_C; - int64_t batch_stride_D; - int64_t 
batch_stride_Max; - int64_t batch_stride_Sum; - - // - // Methods - // - CUTLASS_HOST_DEVICE - Params(): - ptr_D(nullptr), - ptr_Max(nullptr), - ptr_Sum(nullptr) - { - - } - - CUTLASS_HOST_DEVICE - Params(Arguments const &args): - elementwise(args.elementwise), - params_C(args.ref_C.layout()), - params_D(args.ref_D.layout()), - ptr_C(args.ref_C.data()), - ptr_D(args.ref_D.data()), - ptr_Max(args.ptr_Max), - ptr_Sum(args.ptr_Sum), - batch_stride_C(args.batch_stride_C), - batch_stride_D(args.batch_stride_D), - batch_stride_Max(args.batch_stride_Max), - batch_stride_Sum(args.batch_stride_Sum) - { - - } - }; - - /// Shared storage - struct SharedStorage { - - }; - -private: - - Params const & params_; - SharedStorage & shared_storage_; - MatrixCoord extent_; - ElementwiseFunctor elementwise_; - - OutputTileIterator iterator_C_; - OutputTileIterator iterator_D_; - typename OutputTileIterator::Fragment fragment_C_; - typename OutputTileIterator::Fragment fragment_D_; - - ElementAccumulator alpha_; - ElementAccumulator beta_; - - ElementSoftmaxCompute accum_max_; - int threadblock_row_; - -public: - - CUTLASS_DEVICE - EpilogueVisitorBiasMax( - Params const ¶ms, ///< Parameters routed to the epilogue - SharedStorage &shared_storage, ///< Shared storage needed by the functors here - MatrixCoord const &problem_size, ///< Problem size of the output - int thread_idx, ///< Thread index within the threadblock - int warp_idx, ///< Warp index within the threadblock - int lane_idx, ///< Lane index within the warp - MatrixCoord const &threadblock_offset = MatrixCoord(0, 0) - ): - params_(params), - shared_storage_(shared_storage), - extent_(problem_size), - elementwise_(params.elementwise), - iterator_C_(params.params_C, params.ptr_C, problem_size, thread_idx, threadblock_offset), - iterator_D_(params.params_D, params.ptr_D, problem_size, thread_idx, threadblock_offset), - threadblock_row_(threadblock_offset.row()) - { - alpha_ = (params.elementwise.alpha_ptr ? *params.elementwise.alpha_ptr : params.elementwise.alpha); - beta_ = (params.elementwise.beta_ptr ? 
*params.elementwise.beta_ptr : params.elementwise.beta); - - if (beta_ == ElementAccumulator()) { - iterator_C_.clear_mask(); - } - } - - /// Helper to indicate split-K behavior - CUTLASS_DEVICE - void set_k_partition( - int split_k_index, ///< Index of this threadblock within split-K partitioned scheme - int split_k_slices) { ///< Total number of split-K slices - - } - - /// Called to set the batch index - CUTLASS_DEVICE - void set_batch_index(int batch_idx) { - iterator_C_.add_pointer_offset(batch_idx * params_.batch_stride_C); - iterator_D_.add_pointer_offset(batch_idx * params_.batch_stride_D); - } - - /// Called at the start of the epilogue just before iterating over accumulator slices - CUTLASS_DEVICE - void begin_epilogue() { - - } - - /// Called at the start of one step before starting accumulator exchange - CUTLASS_DEVICE - void begin_step(int step_idx) { - fragment_D_.clear(); - fragment_C_.clear(); - - if (elementwise_.kScale != cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) { - iterator_C_.load(fragment_C_); - ++iterator_C_; - } - - } - - /// Called at the start of a row - CUTLASS_DEVICE - void begin_row(int row_idx) { - - } - - /// Called after accumulators have been exchanged for each accumulator vector - CUTLASS_DEVICE - void visit( - int row_idx, - int column_idx, - int frag_idx, - AccumulatorFragment const &accum) { - - using Mul = cutlass::multiplies; - using Minus = cutlass::minus; - using Exp = cutlass::fast_exp_op; - - Minus minus; - Exp exponential; - - SoftmaxFragment result; - - using ConvertSumOutput = cutlass::NumericConverter; - using ConvertNormOutput = cutlass::NumericConverter; - - ConvertSumOutput convert_sum_output; - ConvertNormOutput convert_norm_output; - - NumericArrayConverter source_converter; - OutputVector &source_vector = reinterpret_cast(&fragment_C_)[frag_idx]; - - if (elementwise_.kScale == cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) { - result = source_converter(elementwise_(accum)); - }else{ - result = source_converter(elementwise_(accum, source_vector)); - } - - MatrixCoord thread_offset = - iterator_D_.thread_start() + - OutputTileIterator::ThreadMap::iteration_offset(frag_idx); - - int thread_in_row = OutputTileIterator::ThreadMap::Detail::RowArrangement::Detail::kShapeWidth; - int half_thread_in_row = (thread_in_row >> 1); - - bool column_guard = (thread_offset.column() < extent_.column()); - - // Compute the maximum within one row - if (!column_idx) { - // This is the first fragment in a new row - if (column_guard) { - accum_max_ = maximum_accumulator_(result); - } - } - else { - // This is an additional fragment in the same row - if (column_guard) { - accum_max_ = maximum_accumulator_(result, accum_max_); - } - } - - CUTLASS_PRAGMA_UNROLL - for (int i = half_thread_in_row; i > 0; i >>= 1) { - ElementSoftmaxCompute tmp = __shfl_xor_sync(0xFFFFFFFF, accum_max_, i); - accum_max_ = fast_max(accum_max_, tmp); - } - - SoftmaxFragment sum_frag = exponential(minus(result, accum_max_)); - - ElementSoftmaxCompute reduction_sum = sum_accumulator_(sum_frag); - - CUTLASS_PRAGMA_UNROLL - for (int i = half_thread_in_row; i > 0; i >>= 1) { - ElementSoftmaxCompute tmp = __shfl_xor_sync(0xFFFFFFFF, reduction_sum, i); - reduction_sum += tmp; - } - - bool is_write_thread = (thread_offset.row() < extent_.row() && (threadIdx.x % thread_in_row) == 0); - ElementNorm *curr_ptr_max = params_.ptr_Max + thread_offset.row() + blockIdx.y * extent_.row(); - ElementSum *curr_ptr_sum = params_.ptr_Sum + thread_offset.row() + blockIdx.y * 
extent_.row(); - - arch::global_store( - convert_norm_output(accum_max_), - (void *)curr_ptr_max, - is_write_thread); - - arch::global_store( - convert_sum_output(reduction_sum), - (void *)curr_ptr_sum, - is_write_thread); - - clear_accum_max_(); - - // Convert to the output - NumericArrayConverter output_converter; - OutputVector &output = reinterpret_cast(&fragment_D_)[frag_idx]; - output = output_converter(result); - } - - /// Called at the start of a row - CUTLASS_DEVICE - void end_row(int row_idx) { - - } - - /// Called after all accumulator elements have been visited - CUTLASS_DEVICE - void end_step(int step_idx) { - - iterator_D_.store(fragment_D_); - ++iterator_D_; - } - - /// Called after all steps have been completed - CUTLASS_DEVICE - void end_epilogue() { - - } - -private: - - CUTLASS_DEVICE - void clear_accum_max_() { - - uint32_t float_max_bits = 0xff7fffff; // -FLT_MAX - float min_float = reinterpret_cast(float_max_bits); - accum_max_ = ElementSoftmaxCompute(min_float); - } - - CUTLASS_DEVICE - ElementSoftmaxCompute sum_accumulator_(SoftmaxFragment const &accum) { - ElementSoftmaxCompute sum_ = ElementSoftmaxCompute(0); - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < SoftmaxFragment::kElements; ++i) { - sum_ += ElementSoftmaxCompute(accum[i]); - } - - return sum_; - } - - CUTLASS_DEVICE - ElementSoftmaxCompute maximum_accumulator_(SoftmaxFragment const &accum) { - ElementSoftmaxCompute max_ = accum[0]; - - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < SoftmaxFragment::kElements; ++i) { - max_ = fast_max(max_, ElementSoftmaxCompute(accum[i])); - } - - return max_; - } - - CUTLASS_DEVICE - ElementSoftmaxCompute maximum_accumulator_(SoftmaxFragment const &accum, ElementSoftmaxCompute max_) { - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < SoftmaxFragment::kElements; ++i) { - max_ = fast_max(max_, ElementSoftmaxCompute(accum[i])); + access_d += ApplyShape::kColumn; + access_soft += ApplyShape::kColumn; + idx_n += ApplyShape::kColumn * kAlignment; } - - return max_; } }; @@ -846,10 +299,19 @@ template < typename LayoutB_, typename ElementC_, typename ElementCompute_, + typename OperatorClass_, + typename ArchTag_, + typename ThreadblockShape_, + typename WarpShape_, + typename InstructionShape_, typename EpilogueFunctorOp_, + int kStages_, + typename ApplyShape_ = MatrixShape<1, 1024>, + int AlignmentA_ = 128 / cutlass::sizeof_bits::value, + int AlignmentB_ = 128 / cutlass::sizeof_bits::value, + int AlignmentSoftmax_ = 128 / cutlass::sizeof_bits::value, typename ElementNorm_ = float, typename ElementSum_ = float, - int Alignment = 128 / cutlass::sizeof_bits::value, typename ElementSoftmax_ = ElementC_ > class GemmSoftmax { @@ -872,11 +334,11 @@ class GemmSoftmax { using LayoutA = LayoutA_; using LayoutB = LayoutB_; - static int const kAlignment = Alignment; - using EpilogueFunctorOp = EpilogueFunctorOp_; using ElementNorm = ElementNorm_; + using ApplyShape = ApplyShape_; + // These are mandatory layouts. 
using LayoutC = cutlass::layout::RowMajor; using LayoutN = cutlass::layout::RowMajor; @@ -890,13 +352,17 @@ class GemmSoftmax { using TensorRefSum = TensorRef; using TensorRefSoft = TensorRef; - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; - using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; - using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using ThreadblockShape = ThreadblockShape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + + using OperatorClass = OperatorClass_; + using ArchTag = ArchTag_; - using OperatorClass = cutlass::arch::OpClassTensorOp; - using ArchTag = cutlass::arch::Sm80; - static int const kStages = 3; + static int const kStages = kStages_; + static int const AlignmentA = AlignmentA_; + static int const AlignmentB = AlignmentB_; + static int const AlignmentSoftmax = AlignmentSoftmax_; using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle; @@ -906,10 +372,10 @@ class GemmSoftmax { using DefaultGemmKernel = typename cutlass::gemm::kernel::DefaultGemm< ElementA, LayoutA, - kAlignment, + AlignmentA, ElementB, LayoutB, - kAlignment, + AlignmentB, ElementC, LayoutC, ElementCompute, @@ -930,7 +396,7 @@ class GemmSoftmax { /////////////////////////////////////////////////////////////////////////////////////////////// // Epilogue visitor - using EpilogueVisitor = kernel::EpilogueVisitorBiasMax< + using EpilogueVisitor = typename cutlass::epilogue::threadblock::EpilogueVisitorSoftmax< ThreadblockShape, DefaultGemmKernel::kThreadCount, typename DefaultGemmKernel::Epilogue::OutputTileIterator, @@ -961,13 +427,11 @@ class GemmSoftmax { ElementSum, ElementSoft, ElementSoftmaxCompute, - kAlignment, - MatrixShape< - 1, 1024 - > + AlignmentSoftmax, + ApplyShape >; - using ApplyFinalReductionKernel = kernel::ApplyFinalReduction< + using ApplyFinalReductionKernel = cutlass::reduction::kernel::ApplySoftmaxFinalReduction< ElementNorm, ElementSum, ElementSoftmaxCompute, @@ -983,6 +447,7 @@ class GemmSoftmax { typename SoftmaxApplyKernel::Arguments softmax; typename ApplyFinalReductionKernel::Arguments reduction; cutlass::gemm::GemmCoord extend; + // // Methods // @@ -1013,14 +478,14 @@ class GemmSoftmax { batch_count_, ref_A_, ref_B_, + ref_C_, + ref_D_, + ref_N_.data(), + ref_S_.data(), batch_stride_A_, batch_stride_B_, typename EpilogueVisitor::Arguments( linear_scaling, - ref_C_, - ref_D_, - ref_N_.data(), - ref_S_.data(), batch_stride_C_, batch_stride_D_, batch_stride_Max_, @@ -1028,10 +493,9 @@ class GemmSoftmax { ) ), reduction( - MatrixCoord(problem_size.m(), problem_size.n()), - batch_count_, - ref_N_, - ref_S_, + problem_size, + ref_N_.data(), + ref_S_.data(), batch_stride_Max_, batch_stride_Sum_ ), @@ -1114,9 +578,21 @@ class GemmSoftmax { int gemm_smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + cudaError_t result; + + if (gemm_smem_size >= (48 << 10)) { + result = cudaFuncSetAttribute(cutlass::Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + gemm_smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + cutlass::Kernel<<>>(params_.gemm); - cudaError_t result = cudaGetLastError(); + result = cudaGetLastError(); if (result != cudaSuccess) { return cutlass::Status::kErrorInternal; @@ -1127,42 +603,38 @@ class GemmSoftmax { // Launch the ApplyFinalReductionKernel // - int threadblock_num_in_column = (params_.extend.column() + ThreadblockShape::kN - 1) / ThreadblockShape::kN; - - if (threadblock_num_in_column > 1) { - int 
thread_per_block = 128; - int block_per_row = (params_.extend.row() + thread_per_block - 1) / thread_per_block; - if (block_per_row < 4) { - thread_per_block = 32; - block_per_row = (params_.extend.row() + thread_per_block - 1) / thread_per_block; - } + int thread_per_block = 128; + int block_per_row = (params_.extend.row() + thread_per_block - 1) / thread_per_block; + if (block_per_row < 4) { + thread_per_block = 32; + block_per_row = (params_.extend.row() + thread_per_block - 1) / thread_per_block; + } - dim3 final_reduction_grid(block_per_row); - dim3 final_reduction_block(thread_per_block); + dim3 final_reduction_grid(block_per_row, 1, params_.softmax.args.batch_count); + dim3 final_reduction_block(thread_per_block); - Kernel<<< - final_reduction_grid, final_reduction_block, sizeof(typename ApplyFinalReductionKernel::SharedStorage), stream - >>>(params_.reduction); + Kernel<<< + final_reduction_grid, final_reduction_block, sizeof(typename ApplyFinalReductionKernel::SharedStorage), stream + >>>(params_.reduction); - result = cudaGetLastError(); + result = cudaGetLastError(); - if (result != cudaSuccess) { - return cutlass::Status::kErrorInternal; - } + if (result != cudaSuccess) { + return cutlass::Status::kErrorInternal; } // // Launch the SoftmaxApplyKernel // - dim3 apply_block(SoftmaxApplyKernel::Shape::kColumn, SoftmaxApplyKernel::Shape::kRow); + dim3 apply_block(SoftmaxApplyKernel::ApplyShape::kColumn, SoftmaxApplyKernel::ApplyShape::kRow); - int cta_rows = SoftmaxApplyKernel::Shape::kRow; - int cta_columns = SoftmaxApplyKernel::Shape::kColumn * SoftmaxApplyKernel::kAlignment; + int threadblock_rows = SoftmaxApplyKernel::ApplyShape::kRow; + int threadblock_columns = SoftmaxApplyKernel::ApplyShape::kColumn * SoftmaxApplyKernel::kAlignment; dim3 apply_grid( - (params_.softmax.args.extent.row() + cta_rows - 1) / cta_rows, - (params_.softmax.args.extent.column() + cta_columns - 1) / cta_columns, + (params_.softmax.args.extent.row() + threadblock_rows - 1) / threadblock_rows, + (params_.softmax.args.extent.column() + threadblock_columns - 1) / threadblock_columns, params_.softmax.args.batch_count); Kernel<<< diff --git a/examples/36_gather_scatter_fusion/CMakeLists.txt b/examples/36_gather_scatter_fusion/CMakeLists.txt index 28edd47868..b54ea9ff81 100644 --- a/examples/36_gather_scatter_fusion/CMakeLists.txt +++ b/examples/36_gather_scatter_fusion/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without diff --git a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu index f8fbcc33c3..55852730c2 100644 --- a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu +++ b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -40,18 +40,17 @@ // for (int j = 0; j < options.index_size; ++j) { // int b_c_d_col = tensor_indices.at({j, 0}); // -// for (int k = 0; k < problem_size.k(); ++k) { +// for (int k = 0; k < options.index_size; ++k) { // tensor_d_ref.at({i, b_c_d_col}) += // alpha * tensor_a.at({i, k}) * tensor_b.at({k, b_c_d_col}); // } // } -// } // // Note that the index vector contains unique random integers with max to be N - 1 // // The gather/scatter operation works best when we can still keep the biggest // alignment. For example, when the matrix is row major, we select rows. When -// the matrix is column major, we selct columns. +// the matrix is column major, we select columns. // // Not all the combination of gather and scatter are legal. For example, if A is // row major and C/D is column major, we cannot gather A and scatter C/D at the @@ -60,11 +59,11 @@ // Also, we don't check the index value is legal and index array point is valid // for the sake of the performance. -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include @@ -174,8 +173,8 @@ struct Options { /// Compute performance in GFLOP/s double gflops(double runtime_s) const { - // Number of real-valued multiply-adds - int64_t fmas = problem_size.product(); + // Number of real-valued multiply-adds + int64_t fmas = problem_size.m() * int64_t(index_size) * problem_size.k(); // Two flops per multiply-add return 2.0 * double(fmas) / double(1.0e9) / runtime_s; @@ -188,8 +187,8 @@ struct Options { // elements in input matrices. using ElementAccumulator = float; // <- data type of accumulator using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations -using ElementInputA = cutlass::half_t;; // <- data type of elements in input matrix A -using ElementInputB = cutlass::half_t;; // <- data type of elements in input matrix B +using ElementInputA = cutlass::half_t; // <- data type of elements in input matrix A +using ElementInputB = cutlass::half_t; // <- data type of elements in input matrix B using ElementOutput = float; // <- data type of elements in output matrix D // The code section below describes matrix layout of input and output matrices. @@ -216,7 +215,7 @@ using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 16>; // <- MMA Op tile M = 8 // 16, 8, 16 -> Ampere // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // Define the epilogue operation as LinearCombination. This is approximately equal to // @@ -253,11 +252,11 @@ using Gemm = cutlass::gemm::device::GemmUniversal; @@ -317,7 +316,11 @@ int run(Options &options) { // <- Fill tensor_b_indices on host with unique random integers std::vector to_fill(problem_size.n()) ; // vector with ints. 
std::iota (std::begin(to_fill), std::end(to_fill), 0); // Fill with 0, 1, ...., problem_size.n() - std::random_shuffle(to_fill.begin(), to_fill.end()); + { // std::random_shuffle was deprecated in C++14 and removed in C++17 + std::random_device make_seed; + std::mt19937 source_of_randomness(make_seed()); + std::shuffle(to_fill.begin(), to_fill.end(), source_of_randomness); + } memcpy(tensor_indices.host_data(), to_fill.data(), options.index_size * sizeof(int)); // Copy data from host to GPU @@ -346,14 +349,14 @@ int run(Options &options) { tensor_c.device_data(), // <- reference to matrix C on device tensor_d_scattered.device_data(), // <- reference to matrix D on device tensor_a.layout().capacity(problem_size.mk()), - tensor_b.layout().capacity(cutlass::make_Coord(options.index_size, problem_size.n())), + tensor_b.layout().capacity(cutlass::make_Coord(options.index_size, problem_size.k())), tensor_c.layout().capacity(problem_size.mn()), tensor_d_scattered.layout().capacity(problem_size.mn()), tensor_a.layout().stride(), tensor_b.layout().stride(), tensor_c.layout().stride(), tensor_d_scattered.layout().stride(), - nullptr, // <- pointer to index vector to gather A on device + nullptr, // <- pointer to index vector to gather A on device tensor_indices.device_data(), // <- pointer to index vector to gather B on device tensor_indices.device_data()}; // <- pointer to index vector to scatter D on device @@ -392,7 +395,7 @@ int run(Options &options) { tensor_d_ref.at({i, b_c_d_col}) += alpha * tensor_a.at({i, k}) * tensor_b.at({k, b_c_d_col}); } - + tensor_d_ref.at({i, b_c_d_col}) += (beta * tensor_c.at({i, b_c_d_col})); } } @@ -515,7 +518,7 @@ int main(int argc, const char ** argv) { cudaDeviceProp props; CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); - if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + if (!(props.major >= 8)) { std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." << std::endl; notSupported = true; diff --git a/examples/37_gemm_layernorm_gemm_fusion/CMakeLists.txt b/examples/37_gemm_layernorm_gemm_fusion/CMakeLists.txt new file mode 100644 index 0000000000..334ec381a1 --- /dev/null +++ b/examples/37_gemm_layernorm_gemm_fusion/CMakeLists.txt @@ -0,0 +1,36 @@ + +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +cutlass_example_add_executable( + 37_gemm_layernorm_gemm_fusion + gemm_layernorm.cu + ) + diff --git a/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu b/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu new file mode 100644 index 0000000000..b5a0a1dcb7 --- /dev/null +++ b/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu @@ -0,0 +1,937 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief CUTLASS Layernorm Example. + + This workload provides a layer normalization example using a one-pass, square-sum-based + variance calculation. Specifically, we fuse the reduction operation to find + local mean and local square sum mean in the epilogue of 1st GEMM. After a light + full reduction kernel, the mean / variance values are readily calculated for element-wise + operations which are fused into the 2nd GEMM. + + As stated in https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data, + the square-sum based one-pass implementation may raise concerns on numerical stability issues. 
+ That being said, though this fully fused layernorm example almost perfectly hides all the memory cost to + access the intermediate matrix for layernorm computation, the numerical issue might hinder a persuasive + usage in real-world scenarios. If that is the case, a user may turn to the stand-alone CUTLASS layernorm + example in tools/util/include/cutlass/util/device_layernorm.h + + Examples: + + # Run a CUTLASS layernorm example with default setup , + # using the language of the transformer model as an example, + (Column Major output matrix, hidden dimension = 768, valid word number = 4096, intermediate_scale = 4) + $ ./examples/37_gemm_layernorm_gemm_fusion/37_gemm_layernorm_gemm_fusion + + # Run an attention example with hidden dimension = 512 + $ ./examples/37_gemm_layernorm_gemm_fusion/37_gemm_layernorm_gemm_fusion --hidden_dim=512 + +*/ + +#include +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/arch/memory.h" +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/gemm/device/gemm_complex.h" +#include "cutlass/epilogue/thread/scale_type.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/gemm_complex.h" +#include "cutlass/util/reference/host/tensor_reduce.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/error_metrics.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/fast_math.h" +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "gemm_with_layernorm.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +enum class Disposition { + kPassed, + kIncorrect, + kNotVerified +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +template +struct Options { + + using LayoutOutput = LayoutOutput_; + + static bool const kIsColumnMajorOutput = cutlass::platform::is_same::value; + + bool help; + cutlass::gemm::GemmCoord problem_size0; + cutlass::gemm::GemmCoord problem_size1; + int hidden_dim; + int valid_word_num; + int intermediate_scale; + int iterations; + unsigned seed; + float alpha; + float beta; + bool verification_enabled; + double tolerance; + + Options(): + help(false), + iterations(20), + seed(2022), + hidden_dim(768), + valid_word_num(4096), + intermediate_scale(4), + alpha(1), + beta(0), + verification_enabled(true), + tolerance(0.01), + problem_size1(problem_size0.m() * 4, problem_size0.n(), problem_size0.m()) + { } + + bool valid() { + + return true; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("hidden_dim", hidden_dim, 768); + cmd.get_cmd_line_argument("valid_word_num", valid_word_num, 4096); + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("verify", verification_enabled); + cmd.get_cmd_line_argument("seed", seed); + cmd.get_cmd_line_argument("tolerance", tolerance); + + if (kIsColumnMajorOutput) { + // column major output setup + problem_size0.m() = hidden_dim; + problem_size0.n() = valid_word_num; + 
problem_size0.k() = hidden_dim; + + problem_size1.m() = hidden_dim * intermediate_scale; + problem_size1.n() = valid_word_num; + problem_size1.k() = hidden_dim; + }else{ + // row major output setup + problem_size0.m() = valid_word_num; + problem_size0.n() = hidden_dim; + problem_size0.k() = hidden_dim; + + problem_size1.m() = valid_word_num; + problem_size1.n() = hidden_dim * intermediate_scale; + problem_size1.k() = hidden_dim; + } + + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "37_gemm_layernorm_gemm_fusion example\n\n" + << " This example uses the CUTLASS Library to compute GEMM + Layernorm for arbitrary problem sizes.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --hidden_dim= Hidden dimension\n" + << " --valid_word_num= Valid word number\n" + << " --seed= Random number seed (1*)\n\n" + << " --iterations= Number of profiling iterations to perform (0 to disable profiling).\n\n" + << " --verify= If true, performs reference calculation.\n\n" + << " --tolerance Error tolerance\n" + ; + + out << "\n\nExamples:\n\n" + << "$ ./examples/37_gemm_layernorm_gemm_fusion/37_gemm_layernorm_gemm_fusion \\\n" + << " --hidden_dim=768 --valid_word_num=1024 \n\n"; + + return out; + } + + /// Returns true if the environment and Toolkit support this + bool supported(bool verbose = true) const { + + // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available + // in CUDA 11.0. + // + // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ >= 11)) { + if (verbose) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + } + return false; + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + if (verbose) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + } + return false; + } + + if (!((props.major * 10 + props.minor) >= 80)) { + if (verbose) { + std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80." + << std::endl; + } + return false; + } + + // + // CUTLASS attempts to load 128b vectors of cutlass::half_t (F16) elements. Consequently, + // all pointers, strides, and tensor extents must be divisible by 8 elements. + // + int const kAlignment = 8; + + if ((problem_size0.m() % kAlignment) || + (problem_size0.n() % kAlignment) || + (problem_size0.k() % kAlignment)) { + if (verbose) { + std::cerr << "Misaligned input in 1st GEMM." << std::endl; + } + // misaligned tensors for Gemm1 + return false; + } + + if ((problem_size1.m() % kAlignment) || + (problem_size1.n() % kAlignment) || + (problem_size1.k() % kAlignment)) { + if (verbose) { + std::cerr << "Misaligned input in 2nd GEMM." 
<< std::endl; + } + // misaligned tensors for Gemm2 + return false; + } + + return true; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + typename LayoutOutput_> +struct Testbed { + + // + // Type definitions + // + + // User-defined data types + using ElementInputA0 = cutlass::half_t; + using ElementInputB0 = cutlass::half_t; + using ElementOutput = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + using LayoutInputA0 = cutlass::layout::RowMajor; + using LayoutInputB0 = cutlass::layout::ColumnMajor; + using LayoutOutput = LayoutOutput_; + + static bool const kIsColumnMajorOutput = cutlass::platform::is_same::value; + // turn of shifted K by default + static bool const kIsShiftedVariance = false; + + /// Linear scaling operator + using EpilogueFunctorOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementCompute, + ElementCompute + >; + + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + static int const kStages0 = 3; + static int const kStages1 = 4; + + using GemmLayernorm = cutlass::GemmLayernorm< + ElementInputA0, + LayoutInputA0, + ElementInputB0, + LayoutInputB0, + ElementOutput, + LayoutOutput, + ElementCompute, + EpilogueFunctorOp, + ThreadblockShape, + WarpShape, + InstructionShape, + kStages0, + kStages1, + kIsShiftedVariance + >; + + using ElementInputA1 = typename GemmLayernorm::ElementInputA1; + using ElementOutputC1 = typename GemmLayernorm::ElementOutputC1; + using ElementInputScaleBias = typename GemmLayernorm::ElementInputScaleBias; + using ElementLayernormCompute = typename GemmLayernorm::ElementLayernormCompute; + + using LayoutInputA1 = typename GemmLayernorm::LayoutInputA1; + using LayoutOutputC0 = typename GemmLayernorm::LayoutOutputC0; + using LayoutOutputC1 = typename GemmLayernorm::LayoutOutputC1; + using LayoutInputScaleBias = typename GemmLayernorm::LayoutInputScaleBias; + + // + // Data members + // + + Options const &options; + + cutlass::HostTensor tensor_A0; + cutlass::HostTensor tensor_B0; + cutlass::HostTensor tensor_C0; + cutlass::HostTensor tensor_A1; + cutlass::HostTensor tensor_C1; + + cutlass::HostTensor reference_C0; + cutlass::HostTensor reference_C1; + + cutlass::HostTensor tensor_Variance; + cutlass::HostTensor tensor_Mean; + cutlass::HostTensor tensor_Beta; + cutlass::HostTensor tensor_Gamma; + + cutlass::HostTensor reference_Mean; + cutlass::HostTensor reference_Variance; + + // shifted K tensor to better ensure the numerical stability + // According to https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + // the closer shifted K to the actual mean, the better numerical stability we'll observe + cutlass::HostTensor tensor_Shifted_K; + + // + // Methods + // + + Testbed( + Options const &options_ + ): + options(options_) + { + + tensor_A0.reset({options.problem_size0.m(), options.problem_size0.k()}); + tensor_B0.reset({options.problem_size0.k(), options.problem_size0.n()}); + + tensor_C0.reset({options.problem_size0.m(), options.problem_size0.n()}); + + tensor_A1.reset({options.problem_size1.m(), options.problem_size1.k()}); + tensor_C1.reset({options.problem_size1.m(), options.problem_size1.n()}); + + reference_C0.reset({options.problem_size0.m(), options.problem_size0.n()}); + reference_C1.reset({options.problem_size1.m(), 
options.problem_size1.n()}); + + int leading_dim_0 = kIsColumnMajorOutput ? options.problem_size0.n() : options.problem_size0.m(); + int leading_dim_1 = kIsColumnMajorOutput ? options.problem_size0.m() : options.problem_size0.n(); + + int block_num = (leading_dim_1 + GemmLayernorm::ThreadblockShape::kM - 1) / GemmLayernorm::ThreadblockShape::kM; + + tensor_Variance.reset({block_num, leading_dim_0}); + tensor_Mean.reset({block_num, leading_dim_0}); + tensor_Shifted_K.reset({1, leading_dim_0}); + + tensor_Beta.reset({1, leading_dim_1}); + tensor_Gamma.reset({1, leading_dim_1}); + + reference_Mean.reset({1, leading_dim_0}, false); + reference_Variance.reset({1, leading_dim_0}, false); + + } + + /// Run + Disposition run() { + + Disposition disposition = Disposition::kNotVerified; + + // + // Initialize the workspace + // + + initialize(); + + // + // Launch device kernel + // + cutlass::Status status = cutlass::Status::kSuccess; + + status = execute_device_kernel(); + + if (status != cutlass::Status::kSuccess) { + std::cerr << "Device execution failed." << std::endl; + return disposition; + } + + cudaError_t result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "Device synchronize failed with error " + << cudaGetErrorString(result) << std::endl; + return disposition; + } + + // + // Compute the reference + // + compute_reference(); + + // + // Verify + // + + if (options.verification_enabled) { + + bool passed = verify(); + + if (passed) { + disposition = Disposition::kPassed; + } + else { + disposition = Disposition::kIncorrect; + } + } + + // + // Profiling + // + if (options.iterations) { + profile(); + } + + return disposition; + } + + /// Random initialization + void initialize() { + + cutlass::reference::host::TensorFillRandomUniform( + tensor_A0.host_view(), + options.seed, + ElementInputA0(4), + ElementInputA0(-4), + 0 + ); + + cutlass::reference::host::TensorFillRandomUniform( + tensor_B0.host_view(), + options.seed + 1, + ElementInputB0(4), + ElementInputB0(-4), + 0 + ); + + cutlass::reference::host::TensorFillRandomUniform( + tensor_A1.host_view(), + options.seed + 2, + ElementInputA1(4), + ElementInputA1(-4), + 0 + ); + + cutlass::reference::host::TensorFillRandomUniform( + tensor_Beta.host_view(), + options.seed + 3, + ElementInputScaleBias(4), + ElementInputScaleBias(-4), + 0 + ); + + cutlass::reference::host::TensorFillRandomUniform( + tensor_Gamma.host_view(), + options.seed + 4, + ElementInputScaleBias(4), + ElementInputScaleBias(-4), + 0 + ); + + cutlass::reference::host::TensorFillRandomUniform( + tensor_Shifted_K.host_view(), + options.seed + 5, + ElementOutput(4), + ElementOutput(-5), + 0 + ); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_A1.sync_device(); + tensor_Beta.sync_device(); + tensor_Gamma.sync_device(); + + } + + + + cutlass::Status execute_device_kernel() { + + cutlass::Status status = cutlass::Status::kSuccess; + + // + // Setup arguments + // + + typename GemmLayernorm::Arguments args( + options.problem_size0, + options.problem_size1, + tensor_A0.device_ref().data(), + tensor_B0.device_ref().data(), + tensor_C0.device_ref().data(), + tensor_C0.device_ref().data(), + tensor_A1.device_ref().data(), + tensor_C1.device_ref().data(), + tensor_A0.device_ref().stride(0), + tensor_B0.device_ref().stride(0), + tensor_C0.device_ref().stride(0), + tensor_C0.device_ref().stride(0), + tensor_A1.device_ref().stride(0), + tensor_C1.device_ref().stride(0), + { + ElementCompute(options.alpha), + ElementCompute(options.beta) + }, + 
tensor_Variance.device_ref(), + tensor_Mean.device_ref(), + tensor_Gamma.device_ref(), + tensor_Beta.device_ref(), + tensor_Shifted_K.device_ref().data() + ); + + // + // Launch + // + + GemmLayernorm gemm_layernorm; + + // Initialize + status = gemm_layernorm.initialize(args); + if (status != cutlass::Status::kSuccess) { + return status; + } + + // Run + status = gemm_layernorm(); + + return status; + } + + /// Reference calculation + void compute_reference() { + + cutlass::reference::device::Gemm< + ElementInputA0, + LayoutInputA0, + ElementInputB0, + LayoutInputB0, + ElementOutput, + LayoutOutputC0, + ElementCompute, + ElementCompute + > gemm_device0; + + cutlass::reference::device::Gemm< + ElementInputA1, + LayoutInputA1, + ElementOutput, + LayoutOutputC0, + ElementOutputC1, + LayoutOutputC1, + ElementCompute, + ElementCompute + > gemm_device1; + + // Compute 1st GEMM + gemm_device0( + options.problem_size0, + ElementCompute(options.alpha), + tensor_A0.device_ref(), + tensor_B0.device_ref(), + ElementCompute(options.beta), + tensor_C0.device_ref(), + reference_C0.device_ref() + ); + + reference_C0.sync_host(); + + tensor_Mean.sync_host(); + tensor_Variance.sync_host(); + tensor_Gamma.sync_host(); + tensor_Beta.sync_host(); + tensor_Shifted_K.sync_host(); + + // Compute the sum and square sum for verification purpose + if (kIsColumnMajorOutput) { + for (int n = 0; n < options.problem_size0.n(); ++n) { + + ElementLayernormCompute sum = ElementLayernormCompute(0); + ElementLayernormCompute square_sum = ElementLayernormCompute(0); + for (int m = 0; m < options.problem_size0.m(); ++m) { + sum += ElementLayernormCompute(reference_C0.at({m, n})); + square_sum += ElementLayernormCompute(reference_C0.at({m, n})) * ElementLayernormCompute(reference_C0.at({m, n})); + } + + ElementLayernormCompute mean = sum / ElementLayernormCompute(options.problem_size0.m()); + ElementLayernormCompute square_mean = square_sum / ElementLayernormCompute(options.problem_size0.m()); + ElementLayernormCompute variance = cutlass::constants::one() / cutlass::fast_sqrt(square_mean - mean * mean + ElementLayernormCompute(1e-6) ) ; + + mean = -mean * variance; + + reference_Mean.at({0, n}) = ElementInputScaleBias(mean); + reference_Variance.at({0, n}) = ElementInputScaleBias(variance); + } + }else{ + for (int m = 0; m < options.problem_size0.m(); ++m) { + + ElementLayernormCompute sum = ElementLayernormCompute(0); + ElementLayernormCompute square_sum = ElementLayernormCompute(0); + for (int n = 0; n < options.problem_size0.n(); ++n) { + sum += ElementLayernormCompute(reference_C0.at({m, n})) ; + square_sum += ElementLayernormCompute(reference_C0.at({m, n})) * ElementLayernormCompute(reference_C0.at({m, n})) ; + } + + ElementLayernormCompute mean = sum / ElementLayernormCompute(options.problem_size0.n()); + ElementLayernormCompute square_mean = square_sum / ElementLayernormCompute(options.problem_size0.n()); + ElementLayernormCompute variance = cutlass::constants::one() / cutlass::fast_sqrt(square_mean - mean * mean + ElementLayernormCompute(1e-6)) ; + + mean = -mean * variance; + + reference_Mean.at({0, m}) = ElementInputScaleBias(mean); + reference_Variance.at({0, m}) = ElementInputScaleBias(variance); + } + } + + // Element-wise transform for OutputC0 using 1-pass layernorm algo + if (kIsColumnMajorOutput) { + for (int n = 0; n < options.problem_size0.n(); ++n) { + + ElementLayernormCompute sum = ElementLayernormCompute(0); + for (int m = 0; m < options.problem_size0.m(); ++m) { + sum += 
ElementLayernormCompute(reference_C0.at({m, n})) ; + } + + ElementInputScaleBias mean = ElementInputScaleBias(sum / ElementLayernormCompute(options.problem_size0.m())); + sum = ElementLayernormCompute(0); + for (int m = 0; m < options.problem_size0.m(); ++m) { + sum += ElementLayernormCompute(reference_C0.at({m, n}) - ElementLayernormCompute(mean)) * ElementLayernormCompute(reference_C0.at({m, n}) - ElementLayernormCompute(mean)) ; + } + + ElementLayernormCompute square_mean = sum / ElementLayernormCompute(options.problem_size0.m()); + ElementInputScaleBias variance = ElementInputScaleBias(cutlass::constants::one() + / cutlass::fast_sqrt(square_mean + ElementLayernormCompute(1e-6))) ; + + for (int m = 0; m < options.problem_size0.m(); ++m) { + reference_C0.at({m, n}) = + ElementOutput( ( (ElementInputScaleBias(reference_C0.at({m, n})) - mean) * variance ) + * tensor_Gamma.at({0, m}) + tensor_Beta.at({0, m})); + + } + + } + }else{ + + for (int m = 0; m < options.problem_size0.m(); ++m) { + + float sum = float(0); + for (int n = 0; n < options.problem_size0.n(); ++n) { + sum += float(reference_C0.at({m, n})) ; + } + + float mean = sum / float(options.problem_size0.n()); + sum = float(0); + for (int n = 0; n < options.problem_size0.n(); ++n) { + sum += float(reference_C0.at({m, n}) - mean) * float(reference_C0.at({m, n}) - mean) ; + } + + float square_mean = sum / float(options.problem_size0.n()); + float variance = cutlass::constants::one() / cutlass::fast_sqrt(square_mean + ElementLayernormCompute(1e-6)) ; + + for (int n = 0; n < options.problem_size0.n(); ++n) { + reference_C0.at({m, n}) = + ElementOutput( ( (float(reference_C0.at({m, n})) - mean) * variance ) + * float(tensor_Gamma.at({0, n})) + float(tensor_Beta.at({0, n}))); + + } + + } + + } + + + // Sync host data with device after element-wise transform + reference_C0.sync_device(); + + // Compute 2nd GEMM + gemm_device1( + options.problem_size1, + ElementCompute(options.alpha), + kIsColumnMajorOutput ? tensor_A1.device_ref() : reference_C0.device_ref(), + kIsColumnMajorOutput ? 
reference_C0.device_ref() :tensor_A1.device_ref(), + ElementCompute(options.beta), + reference_C1.device_ref(), + reference_C1.device_ref() + ); + + } + + /// Emits all tensor values + void emit_results() { + std::cout << "tensor_C1 = \n" << tensor_C1.host_view() << "\n\n"; + std::cout << "Reference C1 = \n" << reference_C1.host_view() << "\n\n"; + std::cout << "Mean = \n" << tensor_Mean.host_view() << "\n\n"; + std::cout << "rsqrt(Variance) = \n" << tensor_Variance.host_view() << "\n\n"; + std::cout << "Reference Mean = \n" << reference_Mean.host_view() << "\n\n"; + std::cout << "Reference rsqrt(Variance) = \n" << reference_Variance.host_view() << "\n\n"; + } + + template + bool verify_tensor(cutlass::HostTensor tensor, \ + cutlass::HostTensor reference, + int leading_dim0, int leading_dim1, bool is_print = false) { + float const kThreshold = float(options.tolerance); + float const kAbsThreshold = 0.5f; + float const kRelativeThreshold = 0.1f; + // Adds a constant bias to avoid being divided by '0' + float const kBias = 1e-5f; + int counter = 0; + for (int m = 0; m < leading_dim0; m++) { + for (int n = 0; n < leading_dim1; ++n) { + float diff = (float)(tensor.at({m, n}) - reference.at({m, n})); + float rel_diff = fabs(diff) / fabs(reference.at({m, n}) + kBias); + if (fabs(diff) > kAbsThreshold && rel_diff > kRelativeThreshold) { + counter++; + } + } + } + + float err_rate = float(counter) / (float(leading_dim0) * float(leading_dim1)); + return (err_rate < kThreshold); + } + + /// Verifies the reference matches + bool verify() { + + tensor_Variance.sync_host(); + tensor_Mean.sync_host(); + tensor_C1.sync_host(); + reference_C1.sync_host(); + + // Verification checks - set any of these to 'true' to override the verification checks. + bool verified_C1 = false; + bool verified_Mean = false; + bool verified_Variance = false; + + // Verify layernorm output + if (!verified_C1) { + verified_C1 = verify_tensor(tensor_C1, reference_C1, options.problem_size1.m(), options.problem_size1.n()); + } + + if (!verified_Variance) { + verified_Variance = verify_tensor(tensor_Variance, reference_Variance, 1, options.problem_size0.n()); + } + + if (!verified_Mean) { + verified_Mean = verify_tensor(tensor_Mean, reference_Mean, 1, options.problem_size0.n()); + } + + if (!verified_C1 || !verified_Mean || !verified_Variance) { + + // emit_results(); + + std::cerr << "Verification check failed for tensor Layernorm" << std::endl; + + // Summarize which checks failed + if (!verified_C1) { + std::cerr << "Verification of O tensor failed\n"; + } + + if (!verified_Mean) { + std::cerr << "Verification of Mean tensor failed\n"; + } + + if (!verified_Variance) { + std::cerr << "Verification of Variance tensor failed\n"; + } + + return false; + } + + return true; + } + + /// Profiles + bool profile() { + + // + // Profile + // + + cutlass::Status status = cutlass::Status::kSuccess; + cudaError_t result; + cudaEvent_t events[2]; + int const kIterations = options.iterations; + + for (cudaEvent_t &evt : events) { + result = cudaEventCreate(&evt); + if (result != cudaSuccess) { + std::cerr << "cudaEventCreate failed with error " << cudaGetErrorString(result) << std::endl; + return false; + } + } + + result = cudaEventRecord(events[0]); + + if (result != cudaSuccess) { + std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl; + return false; + } + + for (int iter = 0; iter < kIterations; ++iter) { + + status = execute_device_kernel(); + + if (status != cutlass::Status::kSuccess) { + 
std::cerr << "Device execution failed." << std::endl; + return false; + } + } + + result = cudaEventRecord(events[1]); + + if (result != cudaSuccess) { + std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl; + return false; + } + + result = cudaDeviceSynchronize(); + + if (result != cudaSuccess) { + std::cerr << "cudaDeviceSynchronize() failed with error " << cudaGetErrorString(result) << std::endl; + return false; + } + + float elapsed_ms = 0; + result = cudaEventElapsedTime(&elapsed_ms, events[0], events[1]); + + float elapsed_ms_per_iter = elapsed_ms / float(kIterations); + + if (result != cudaSuccess) { + std::cerr << "cudaEventElapsedTime() failed with error " << cudaGetErrorString(result) << std::endl; + return false; + } + + for (cudaEvent_t &evt : events) { + result = cudaEventDestroy(evt); + if (result != cudaSuccess) { + std::cerr << "cudaEventDestroy() failed with error " << cudaGetErrorString(result) << std::endl; + return false; + } + } + + int64_t flops = int64_t(options.problem_size0.m()) * options.problem_size0.n() * options.problem_size0.k() * 2 \ + + int64_t(options.problem_size1.m()) * options.problem_size1.n() * options.problem_size1.k() * 2; + + double gflops_per_second = double(flops) * kIterations / double(elapsed_ms / 1000.0f) / double(1.0e9); + + std::cout << " 1st GEMM: " + << options.problem_size0.m() << "-by-" << options.problem_size0.n() << "-by-" << options.problem_size0.k() << "\n" + << " 2nd GEMM: " + << options.problem_size1.m() << "-by-" << options.problem_size1.n() << "-by-" << options.problem_size1.k() + << std::endl; + + std::cout << " Runtime / iteration: " << elapsed_ms_per_iter << " ms\n" << std::endl; + std::cout << " GFLOPs: " << gflops_per_second << " GFLOPs" << std::endl; + + return true; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, const char **argv) { + + // Define final layout + using LayoutOutput = cutlass::layout::ColumnMajor; + + // Options parsing + Options options; + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (!options.supported()) { + return 0; + } + + // Run + Testbed testbed(options); + + Disposition disposition = testbed.run(); + + std::cout << std::endl; + + switch (disposition) { + case Disposition::kPassed: + std::cout << "Passed" << std::endl; + break; + case Disposition::kIncorrect: + std::cout << "Incorrect" << std::endl; + break; + case Disposition::kNotVerified: + std::cout << "Not verified" << std::endl; + break; + } + + return (disposition == Disposition::kPassed ? 0 : -1); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/37_gemm_layernorm_gemm_fusion/gemm_with_epilogue_visitor.h b/examples/37_gemm_layernorm_gemm_fusion/gemm_with_epilogue_visitor.h new file mode 100644 index 0000000000..666f3cb566 --- /dev/null +++ b/examples/37_gemm_layernorm_gemm_fusion/gemm_with_epilogue_visitor.h @@ -0,0 +1,444 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief GEMM kernel to support the epilogue visitor model + for customized layernorm partial reduction epilogue fusion. + + This source file will likely be moved to `include/cutlass/gemm/kernel/` in the future once + its usage has been stabilized. For now, it is included in this example to demonstrate + some basic output fusion options. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/complex.h" +#include "cutlass/semaphore.h" + +#include "cutlass/trace.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_ ///! 
Threadblock swizzling function +> +struct GemmWithEpilogueVisitor { +public: + + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueVisitor = typename Epilogue::Visitor; + using ThreadblockSwizzle = ThreadblockSwizzle_; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using TensorRefA = TensorRef; + + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using TensorRefB = TensorRef; + + using ElementC = typename EpilogueVisitor::ElementOutput; + using LayoutC = typename Epilogue::Layout; + + static ComplexTransform const kTransformA = Mma::kTransformA; + static ComplexTransform const kTransformB = Mma::kTransformB; + using Operator = typename Mma::Operator; + + using OperatorClass = typename Mma::Operator::OperatorClass; + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename Mma::Operator::Shape; + using InstructionShape = typename Mma::Policy::Operator::InstructionShape; + using ArchTag = typename Mma::ArchTag; + + static int const kStages = Mma::kStages; + static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentC = EpilogueVisitor::kElementsPerAccess; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + /// Split-K preserves splits that are 128b aligned + static int const kSplitKAlignment = const_max( + 128 / sizeof_bits::value, + 128 / sizeof_bits::value + ); + + // + // Structures + // + + /// Argument structure + struct Arguments { + + // + // Data members + // + + GemmUniversalMode mode; + GemmCoord problem_size; + + TensorRefA ref_A; + TensorRefB ref_B; + + typename EpilogueVisitor::Arguments epilogue_visitor; + + // + // Methods + // + + Arguments(): + mode(GemmUniversalMode::kGemm) + { } + + + /// constructs an arguments structure + Arguments( + GemmUniversalMode mode_, + GemmCoord problem_size_, + TensorRefA ref_A_, + TensorRefB ref_B_, + typename EpilogueVisitor::Arguments epilogue_visitor_ + ): + mode(mode_), + problem_size(problem_size_), + ref_A(ref_A_), + ref_B(ref_B_), + epilogue_visitor(epilogue_visitor_) + { + + } + }; + + // + // Structure for precomputing values in host memory and passing to kernels + // + + /// Parameters structure + struct Params { + + cutlass::gemm::GemmCoord problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + int swizzle_log_tile; + + typename Mma::IteratorA::Params params_A; + typename Mma::IteratorB::Params params_B; + + GemmUniversalMode mode; + int gemm_k_size; + + void * ptr_A; + void * ptr_B; + + typename EpilogueVisitor::Params epilogue_visitor; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): + swizzle_log_tile(0), + params_A(0), + params_B(0), + gemm_k_size(0), + mode(cutlass::gemm::GemmUniversalMode::kGemm), + ptr_A(nullptr), + ptr_B(nullptr) + { } + + + Params( + Arguments const &args + ): + problem_size(args.problem_size), + swizzle_log_tile(0), + params_A(args.ref_A.layout()), + params_B(args.ref_B.layout()), + mode(args.mode), + gemm_k_size(args.problem_size.k()), + ptr_A(args.ref_A.data()), + ptr_B(args.ref_B.data()), + epilogue_visitor(args.epilogue_visitor) + { + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, 1); + + if 
(args.mode == GemmUniversalMode::kGemm || args.mode == GemmUniversalMode::kGemmSplitKParallel) { + + int const kAlignK = const_max(const_max(128 / sizeof_bits::value, 128 / sizeof_bits::value), 1); + + gemm_k_size = round_up(args.problem_size.k(), kAlignK); + + if (gemm_k_size) { + grid_tiled_shape.k() = ceil_div(args.problem_size.k(), gemm_k_size); + } + } + + swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape); + } + }; + + /// Shared memory storage structure + union SharedStorage { + + typename Mma::SharedStorage main_loop; + + struct { + typename Epilogue::SharedStorage epilogue; + typename EpilogueVisitor::SharedStorage visitor; + } epilogue; + }; + +public: + + // + // Methods + // + + CUTLASS_DEVICE + GemmWithEpilogueVisitor() { } + + /// Determines whether kernel satisfies alignment + static Status can_implement( + cutlass::gemm::GemmCoord const & problem_size) { + + CUTLASS_TRACE_HOST("GemmWithEpilogueVisitor::can_implement()"); + + static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; + + bool isAMisaligned = false; + bool isBMisaligned = false; + bool isCMisaligned = false; + + if (platform::is_same::value) { + isAMisaligned = problem_size.k() % kAlignmentA; + } else if (platform::is_same::value) { + isAMisaligned = problem_size.m() % kAlignmentA; + } else if (platform::is_same>::value + || platform::is_same>::value) { + isAMisaligned = problem_size.k() % kAlignmentA; + } + + if (platform::is_same::value) { + isBMisaligned = problem_size.n() % kAlignmentB; + } else if (platform::is_same::value) { + isBMisaligned = problem_size.k() % kAlignmentB; + } else if (platform::is_same>::value + || platform::is_same>::value) { + isBMisaligned = problem_size.k() % kAlignmentB; + } + + if (platform::is_same::value) { + isCMisaligned = problem_size.n() % kAlignmentC; + } else if (platform::is_same::value) { + isCMisaligned = problem_size.m() % kAlignmentC; + } else if (platform::is_same>::value + || platform::is_same>::value) { + isCMisaligned = problem_size.n() % kAlignmentC; + } + + if (isAMisaligned) { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for A operand"); + return Status::kErrorMisalignedOperand; + } + + if (isBMisaligned) { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for B operand"); + return Status::kErrorMisalignedOperand; + } + + if (isCMisaligned) { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for C operand"); + return Status::kErrorMisalignedOperand; + } + + CUTLASS_TRACE_HOST(" returning kSuccess"); + + return Status::kSuccess; + } + + static Status can_implement(Arguments const &args) { + return can_implement(args.problem_size); + } + + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || + params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { + + return; + } + + int offset_k = 0; + int problem_size_k = params.problem_size.k(); + + ElementA *ptr_A = static_cast(params.ptr_A); + ElementB *ptr_B = static_cast(params.ptr_B); + + // Compute initial location in logical coordinates + 
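+    // The threadblock's tile coordinates are mapped to element offsets here: the A tile begins
+    // at row (tile_m * Mma::Shape::kM) and column offset_k, and the B tile begins at row offset_k
+    // and column (tile_n * Mma::Shape::kN). offset_k is zero because each threadblock iterates
+    // over the full K extent in the main loop below.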
cutlass::MatrixCoord tb_offset_A{ + threadblock_tile_offset.m() * Mma::Shape::kM, + offset_k, + }; + + cutlass::MatrixCoord tb_offset_B{ + offset_k, + threadblock_tile_offset.n() * Mma::Shape::kN + }; + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.params_A, + ptr_A, + {params.problem_size.m(), problem_size_k}, + thread_idx, + tb_offset_A); + + typename Mma::IteratorB iterator_B( + params.params_B, + ptr_B, + {problem_size_k, params.problem_size.n()}, + thread_idx, + tb_offset_B); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma( + gemm_k_iterations, + accumulators, + iterator_A, + iterator_B, + accumulators); + + // + // Masked tile iterators constructed from members + // + + threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + //assume identity swizzle + MatrixCoord threadblock_offset( + threadblock_tile_offset.m() * Mma::Shape::kM, + threadblock_tile_offset.n() * Mma::Shape::kN + ); + + int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + + // + // Construct the epilogue visitor + // + + EpilogueVisitor epilogue_visitor( + params.epilogue_visitor, + shared_storage.epilogue.visitor, + params.problem_size.mn(), + thread_idx, + warp_idx, + lane_idx, + threadblock_offset); + + if (params.mode == GemmUniversalMode::kGemm) { + // Indicate which position in a serial reduction the output operator is currently updating + epilogue_visitor.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); + } + else if (params.mode == GemmUniversalMode::kBatched || params.mode == GemmUniversalMode::kArray) { + epilogue_visitor.set_batch_index(threadblock_tile_offset.k()); + } + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Execute the epilogue operator to update the destination tensor. + epilogue(epilogue_visitor, accumulators); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/37_gemm_layernorm_gemm_fusion/gemm_with_layernorm.h b/examples/37_gemm_layernorm_gemm_fusion/gemm_with_layernorm.h new file mode 100644 index 0000000000..b33954ecce --- /dev/null +++ b/examples/37_gemm_layernorm_gemm_fusion/gemm_with_layernorm.h @@ -0,0 +1,1066 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief A file contains all functioning classes needed by GemmLayernorm. + + GemmLayernorm example = GEMM0 with partial reduction fused in epilogue (EpilogueVisitorLayerNorm) + + lightweight full reduction kernel (ApplyFinalReduction) + + GEMM1 with elemenwise operations fused in mainloop (GemmLayernormMainloopFusion) + +*/ + +#pragma once + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/arch/memory.h" +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h" +#include "cutlass/gemm/kernel/gemm_transpose_operands.h" +#include "cutlass/gemm/kernel/default_gemm.h" +#include "cutlass/gemm/kernel/default_gemm_complex.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/epilogue/threadblock/epilogue_with_visitor.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "gemm_with_epilogue_visitor.h" +#include "helper.h" +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementVariance_, + typename ElementMean_, + typename ElementLayernormCompute_, + typename ElementOutput, + typename ThreadblockShape_, + bool IsShiftedVariance_ = false +> +class ApplyFinalReduction { +public: + + using ElementVariance = ElementVariance_; + using ElementMean = ElementMean_; + using ElementLayernormCompute = ElementLayernormCompute_; + using ThreadblockShape = 
ThreadblockShape_; + + // Pre-processing has ensured the layout equivelent to RowMajor + using Layout = cutlass::layout::RowMajor; + + using TensorVariance = TensorRef; + using TensorMean = TensorRef; + + static bool const kIsShiftedVariance = IsShiftedVariance_; + + // + // Arguments + // + + struct Arguments { + + MatrixCoord extent; ///< Extent of D and Layernorm matrices + TensorVariance ref_Variance; ///< Sum Square or Variance tensor (input / output) + TensorMean ref_Mean; ///< Sum or Mean tensor (input / output) + ElementOutput *ptr_Shifted_K; ///< Shifted K tensor pointer + + // + // Methods + // + Arguments(){ } + + Arguments( + MatrixCoord extent_, + TensorVariance ref_Variance_, + TensorMean ref_Mean_, + ElementOutput *ptr_Shifted_K_ + ): + extent(extent_), + ref_Variance(ref_Variance_), + ref_Mean(ref_Mean_), + ptr_Shifted_K(ptr_Shifted_K_) + { + + } + }; + + struct SharedStorage { + + + }; + + // + // Params struct + // + + struct Params { + Arguments args; + + // + // Methods + // + Params() { } + + Params(Arguments const &args_): args(args_) { } + }; + +private: + +public: + + CUTLASS_DEVICE + ApplyFinalReduction() { } + + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + apply(params, shared_storage); + } + +private: + + /// Partial reduction + CUTLASS_DEVICE + void apply(Params const ¶ms, SharedStorage &shared_storage) { + + int threadblock_num = (params.args.extent.column() + ThreadblockShape::kM - 1) / ThreadblockShape::kM; + + int block_n = blockIdx.x * blockDim.x; + + int thread_n = threadIdx.x; + + int idx_n = block_n + thread_n; + + if (idx_n >= params.args.extent.row()) { + return; + } + + using ConvertVarianceOutput = cutlass::NumericConverter; + using ConvertMeanOutput = cutlass::NumericConverter; + + using ConvertVariance = cutlass::NumericConverter; + using ConvertMean = cutlass::NumericConverter; + + using ConvertShiftK = cutlass::NumericConverter; + + ConvertVariance convert_variance; + ConvertMean convert_mean; + + ConvertVarianceOutput convert_variance_output; + ConvertMeanOutput convert_mean_output; + + ElementVariance *access_square = params.args.ref_Variance.data() + idx_n; + ElementMean *access_mean = params.args.ref_Mean.data() + idx_n; + + ElementVariance *access_square_bak = access_square; + ElementMean *access_mean_bak = access_mean; + + ElementLayernormCompute frag_square_sum = ElementLayernormCompute(0); + ElementLayernormCompute frag_element_sum = ElementLayernormCompute(0); + ElementVariance fetch_square; + ElementMean fetch_mean; + + CUTLASS_PRAGMA_UNROLL + for (int idx_m = 0; idx_m < threadblock_num; idx_m++) { + arch::global_load(fetch_square, access_square, true); + arch::global_load(fetch_mean, access_mean, true); + frag_element_sum += convert_mean(fetch_mean); + frag_square_sum += convert_variance(fetch_square); + access_square += params.args.extent.row(); + access_mean += params.args.extent.row(); + } + + ElementLayernormCompute mean = frag_element_sum; + ElementLayernormCompute square_mean = frag_square_sum; + + ElementLayernormCompute variance; + + if (kIsShiftedVariance && params.args.ptr_Shifted_K != nullptr) { + ElementOutput *access_shift_k = params.args.ptr_Shifted_K + idx_n; + ElementOutput fetch_shift_k; + ConvertShiftK convert_shift_k; + arch::global_load(fetch_shift_k, access_shift_k, true); + ElementLayernormCompute shifted_mean = mean - convert_shift_k(fetch_shift_k); + variance = cutlass::constants::one() / cutlass::fast_sqrt(square_mean - shifted_mean * shifted_mean + 
ElementLayernormCompute(1e-6)); + }else{ + variance = cutlass::constants::one() / cutlass::fast_sqrt(square_mean - mean * mean + ElementLayernormCompute(1e-6)); + } + + mean = -mean * variance; + + access_square = access_square_bak; + access_mean = access_mean_bak; + + access_square[0] = convert_variance_output(variance); + access_mean[0] = convert_mean_output(mean); + + } + +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ThreadblockShape_, + int ThreadCount, + typename OutputTileIterator_, + typename AccumulatorTile_, + typename ElementAccumulator_, + typename ElementVariance_, + typename ElementMean_, + typename ElementLayernormCompute_, + typename ElementwiseFunctor_, + bool IsShiftedVariance_ = false +> +class EpilogueVisitorLayerNorm { +public: + + using ElementVariance = ElementVariance_; + using ElementMean = ElementMean_; + using ElementLayernormCompute = ElementLayernormCompute_; + + using AccumulatorTile = AccumulatorTile_; + + using ThreadblockShape = ThreadblockShape_; + static int const kThreadCount = ThreadCount; + + using OutputTileIterator = OutputTileIterator_; + using ElementwiseFunctor = ElementwiseFunctor_; + + static int const kIterations = OutputTileIterator::kIterations; + static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess; + static int const kRowIterations = OutputTileIterator::ThreadMap::Iterations::kRow; + + static int const kThreads = OutputTileIterator::ThreadMap::kThreads; + + static bool const kIsShiftedVariance = IsShiftedVariance_; + + using ElementOutput = typename OutputTileIterator::Element; + + static int const kDeltaRow = OutputTileIterator::ThreadMap::Delta::kRow; + + /// Array type used in Shift-K Layernorm + static int const kRowAccessCount = kIterations * kRowIterations; + + using ConvertedShiftFragment = Array; + + // Conducts manual transpose externally (already supported) for column major + using LayoutOutput = cutlass::layout::RowMajor; + + using ElementAccumulator = ElementAccumulator_; + + using AccumulatorFragment = Array; + using LayernormFragment = Array; + using OutputVector = Array; + using TensorRefD = TensorRef; + + static int const kThreadsPerRow = OutputTileIterator::ThreadMap::Detail::RowArrangement::Detail::kShapeWidth; + static int const kThreadsInColumn = kThreads / kThreadsPerRow; + static int const kHalfThreadsPerRow = (kThreadsPerRow >> 1); + + /// Argument structure + struct Arguments { + + typename ElementwiseFunctor::Params elementwise; + TensorRefD ref_C; + TensorRefD ref_D; + ElementVariance *ptr_Variance; + ElementMean *ptr_Mean; + ElementOutput *ptr_Shifted_K; + + // + // Methods + // + Arguments(): + ptr_Variance(nullptr), + ptr_Mean(nullptr), + ptr_Shifted_K(nullptr) + { + + } + + Arguments( + typename ElementwiseFunctor::Params elementwise_, + TensorRefD ref_C_, + TensorRefD ref_D_, + ElementVariance *ptr_Variance, + ElementMean *ptr_Mean_, + ElementOutput *ptr_Shifted_K_ = nullptr + ): + elementwise(elementwise_), + ref_C(ref_C_), + ref_D(ref_D_), + ptr_Variance(ptr_Variance), + ptr_Mean(ptr_Mean_), + ptr_Shifted_K(ptr_Shifted_K_) + { + + } + }; + + struct Params { + + typename ElementwiseFunctor::Params elementwise; + typename OutputTileIterator::Params params_C; + typename OutputTileIterator::Params params_D; + typename OutputTileIterator::Element *ptr_C; + typename OutputTileIterator::Element *ptr_D; + ElementVariance *ptr_Variance; + ElementMean *ptr_Mean; + ElementOutput *ptr_Shifted_K; + + // + // 
Methods + // + CUTLASS_HOST_DEVICE + Params(): + ptr_D(nullptr), + ptr_Variance(nullptr), + ptr_Mean(nullptr) + { + + } + + CUTLASS_HOST_DEVICE + Params(Arguments const &args): + elementwise(args.elementwise), + params_C(args.ref_C.layout()), + params_D(args.ref_D.layout()), + ptr_C(args.ref_C.data()), + ptr_D(args.ref_D.data()), + ptr_Variance(args.ptr_Variance), + ptr_Mean(args.ptr_Mean), + ptr_Shifted_K(args.ptr_Shifted_K) + { + + } + }; + + /// Shared storage + struct SharedStorage { + + }; + +private: + + Params const & params_; + SharedStorage & shared_storage_; + MatrixCoord extent_; + ElementwiseFunctor elementwise_; + + OutputTileIterator iterator_C_; + OutputTileIterator iterator_D_; + typename OutputTileIterator::Fragment fragment_C_; + typename OutputTileIterator::Fragment fragment_D_; + + ElementAccumulator alpha_; + ElementAccumulator beta_; + ConvertedShiftFragment shift_k_frag_; + + ElementLayernormCompute accum_sum_square_; + ElementLayernormCompute accum_sum_element_; + + MatrixCoord thread_offset_; + +public: + + CUTLASS_DEVICE + EpilogueVisitorLayerNorm( + Params const ¶ms, ///< Parameters routed to the epilogue + SharedStorage &shared_storage, ///< Shared storage needed by the functors here + MatrixCoord const &problem_size0, ///< Problem size of the output + int thread_idx, ///< Thread index within the threadblock + int warp_idx, ///< Warp index within the threadblock + int lane_idx, ///< Lane index within the warp + MatrixCoord const &threadblock_offset = MatrixCoord(0, 0) + ): + params_(params), + shared_storage_(shared_storage), + extent_(problem_size0), + elementwise_(params.elementwise), + iterator_C_(params.params_C, params.ptr_C, problem_size0, thread_idx, threadblock_offset), + iterator_D_(params.params_D, params.ptr_D, problem_size0, thread_idx, threadblock_offset) + { + alpha_ = (params.elementwise.alpha_ptr ? *params.elementwise.alpha_ptr : params.elementwise.alpha); + beta_ = (params.elementwise.beta_ptr ? 
*params.elementwise.beta_ptr : params.elementwise.beta); + + if (beta_ == ElementAccumulator()) { + iterator_C_.clear_mask(); + } + } + + /// Helper to indicate split-K behavior + CUTLASS_DEVICE + void set_k_partition( + int split_k_index, ///< Index of this threadblock within split-K partitioned scheme + int split_k_slices) { ///< Total number of split-K slices + + } + + /// Called to set the batch index + CUTLASS_DEVICE + void set_batch_index(int batch_idx) { + + } + + /// Called at the start of the epilogue just before iterating over accumulator slices + CUTLASS_DEVICE + void begin_epilogue() { + + // If shift-K feature is enabled, we load shift-k fragment + // at the very beginning of an epilogue + if (kIsShiftedVariance && params_.ptr_Shifted_K != nullptr) { + shift_k_frag_.clear(); + int thread_offset_row_base = iterator_D_.thread_start_row(); + + CUTLASS_PRAGMA_UNROLL + for (int iter_idx = 0; iter_idx < kIterations; ++iter_idx) { + int step_offset = iter_idx * OutputTileIterator::Shape::kRow; + CUTLASS_PRAGMA_UNROLL + for (int rid = 0; rid < kRowIterations; ++rid) { + int row_step_offset = rid * kDeltaRow; + int row_offset = thread_offset_row_base + step_offset + row_step_offset; + bool is_load = (row_offset < extent_.row()); + shift_k_frag_[iter_idx * kRowIterations + rid] = load_shift_k_(row_offset, is_load); + } + + } + + } + + } + + /// Called at the start of one step before starting accumulator exchange + CUTLASS_DEVICE + void begin_step(int step_idx) { + fragment_D_.clear(); + + if (elementwise_.kScale != cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) { + fragment_C_.clear(); + iterator_C_.load(fragment_C_); + ++iterator_C_; + } + } + + /// Called at the start of a row + CUTLASS_DEVICE + void begin_row(int row_idx) { + + } + + /// Called after accumulators have been exchanged for each accumulator vector + CUTLASS_DEVICE + void visit( + int iter_idx, + int row_idx, + int column_idx, + int frag_idx, + AccumulatorFragment const &accum) { + + using Mul = cutlass::multiplies; + using Minus = cutlass::minus; + using Exp = cutlass::fast_exp_op; + + [[maybe_unused]] Minus minus; + [[maybe_unused]] Mul mul; + [[maybe_unused]] Exp exponential; + + LayernormFragment result; + + thread_offset_ = + iterator_D_.thread_start() + + OutputTileIterator::ThreadMap::iteration_offset(frag_idx); + + NumericArrayConverter source_converter; + OutputVector &source_vector = reinterpret_cast(&fragment_C_)[frag_idx]; + + bool column_guard = (thread_offset_.column() < extent_.column()); + + if (elementwise_.kScale == cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) { + result = source_converter(elementwise_(accum)); + }else{ + result = source_converter(elementwise_(accum, source_vector)); + } + + + ElementLayernormCompute inv_scalar = cutlass::constants::one() / ElementLayernormCompute(extent_.column()); + + // Fragment is cleared for non-reachable columns so no need to check against column guard + accum_sum_element_ = element_sum_accumulator_(result); + + // Square sum is different. Non-reachable columns should've been computed for shift-k + // Otherwise we will incorrectly have some extra k^2 added into square sum. + if (column_guard) { + accum_sum_square_ = (kIsShiftedVariance) ? 
\ + square_sum_accumulator_(result, shift_k_frag_[iter_idx * kRowIterations + row_idx]) : \ + square_sum_accumulator_(result); + } + else { + accum_sum_square_ = ElementLayernormCompute(0); + } + + accum_sum_element_ *= inv_scalar; + accum_sum_square_ *= inv_scalar; + + // After performing the in-thread reduction, we then perform cross-thread / in-warp reduction + CUTLASS_PRAGMA_UNROLL + for (int i = kHalfThreadsPerRow; i > 0; i >>= 1) { + accum_sum_element_ += __shfl_xor_sync(0xFFFFFFFF, accum_sum_element_, i); + accum_sum_square_ += __shfl_xor_sync(0xFFFFFFFF, accum_sum_square_, i); + } + + // Convert to the output + NumericArrayConverter output_converter; + OutputVector &output = reinterpret_cast(&fragment_D_)[frag_idx]; + output = output_converter(result); + } + + /// Called at the start of a row + CUTLASS_DEVICE + void end_row(int row_idx) { + + using ConvertVarianceOutput = cutlass::NumericConverter; + using ConvertMeanOutput = cutlass::NumericConverter; + + ConvertVarianceOutput convert_variance_output; + ConvertMeanOutput convert_mean_output; + + bool is_write_thread = (thread_offset_.row() < extent_.row() && (threadIdx.x % kThreadsPerRow) == 0); + int row_offset = thread_offset_.row() + blockIdx.y * extent_.row(); + + ElementVariance *curr_ptr_sum_square = params_.ptr_Variance + row_offset; + ElementMean *curr_ptr_element_sum = params_.ptr_Mean + row_offset; + + arch::global_store( + convert_variance_output(accum_sum_square_), + (void *)curr_ptr_sum_square, + is_write_thread); + + arch::global_store( + convert_mean_output(accum_sum_element_), + (void *)curr_ptr_element_sum, + is_write_thread); + + } + + /// Called after all accumulator elements have been visited + CUTLASS_DEVICE + void end_step(int step_idx) { + + iterator_D_.store(fragment_D_); + ++iterator_D_; + } + + /// Called after all steps have been completed + CUTLASS_DEVICE + void end_epilogue() { + + } + +private: + + CUTLASS_DEVICE + ElementLayernormCompute load_shift_k_(int row_offset, bool is_load) { + using ConvertShiftK = cutlass::NumericConverter; + ConvertShiftK convert_shift_k; + ElementOutput shift_k_val; + + // Computes the address to load shift_k element + ElementOutput *curr_ptr_shift_k = params_.ptr_Shifted_K + row_offset; + // Conditionally loads from global memory + arch::global_load(shift_k_val, (void *)curr_ptr_shift_k, is_load); + // Converts data type to return + ElementLayernormCompute converted_shift_k_val = convert_shift_k(shift_k_val); + + return converted_shift_k_val; + } + + CUTLASS_DEVICE + ElementLayernormCompute square_sum_accumulator_(LayernormFragment const &accum) { + ElementLayernormCompute sum_ = ElementLayernormCompute(0); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < LayernormFragment::kElements; ++i) { + auto accum_ = accum[i]; + sum_ += accum_ * accum_; + } + + return sum_; + } + + CUTLASS_DEVICE + ElementLayernormCompute square_sum_accumulator_(LayernormFragment const &accum, ElementLayernormCompute shift_k_val) { + ElementLayernormCompute sum_ = ElementLayernormCompute(0); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < LayernormFragment::kElements; ++i) { + auto accum_ = accum[i] - shift_k_val; + sum_ += accum_ * accum_; + } + + return sum_; + } + + CUTLASS_DEVICE + ElementLayernormCompute element_sum_accumulator_(LayernormFragment const &accum) { + ElementLayernormCompute sum_ = ElementLayernormCompute(0); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < LayernormFragment::kElements; ++i) { + sum_ += accum[i]; + } + + return sum_; + } + +}; + 
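+// How the two kernels above fit together (an explanatory sketch, not code used by the example):
+//
+// For every output row, EpilogueVisitorLayerNorm accumulates the partial statistics
+//
+//   partial_mean = (1/N) * sum_j x[j]        over the columns this threadblock produces
+//   partial_sq   = (1/N) * sum_j x[j]^2      (optionally shifted by a per-row constant when
+//                                             the shift-K feature is enabled)
+//
+// reduces them across the threads covering the row with __shfl_xor_sync, and stores one partial
+// pair per threadblock. ApplyFinalReduction then sums the partials of each row and overwrites
+// them with the values consumed downstream:
+//
+//   inv_std = 1 / sqrt(sum_sq - mean * mean + 1e-6)
+//   bias    = -mean * inv_std
+//
+// so the normalization can later be applied as a single multiply-add, since
+// x * inv_std + bias == (x - mean) * inv_std.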
+///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// +template < + typename ElementInputA0_, + typename LayoutInputA0_, + typename ElementInputB0_, + typename LayoutInputB0_, + typename ElementOutput_, + typename LayoutOutput_, + typename ElementCompute_, + typename EpilogueFunctorOp_, + typename ThreadblockShape_, + typename WarpShape_, + typename InstructionShape_, + int Stages0, + int Stages1, + bool IsShiftedVariance_ = false +> +class GemmLayernorm { +public: + + /////////////////////////////////////////////////////////////////////////////////////////////// + + // + // Type definitions + // + + static bool const kInternalTranspose = cutlass::platform::is_same::value; + static bool const kIsShiftedVariance = IsShiftedVariance_; + + // These is mandatory layout. + using LayoutInputScaleBias = cutlass::layout::RowMajor; + + // These are mandatory data types. + using ElementLayernormCompute = float; + using ElementInputScaleBias = cutlass::half_t; + + // These are mandatory params required by mainloop fusion + using OperatorClass = cutlass::arch::OpClassTensorOp; + using ArchTag = cutlass::arch::Sm80; + + // These are mandatory layouts and data types + // that are inheritated from pre-defined params + + using LayoutSumSqr = LayoutInputScaleBias; + using LayoutSum = LayoutInputScaleBias; + + using ElementMean = ElementInputScaleBias; + using ElementVariance = ElementInputScaleBias; + + /////////////////////////////////////////////////////////////////////////////////////////////// + + using LayoutInputA0 = LayoutInputA0_; + using LayoutInputB0 = LayoutInputB0_; + using LayoutInputA1 = LayoutOutput_; + using LayoutInputB1 = LayoutOutput_; + using LayoutOutputC0 = LayoutOutput_; + using LayoutOutputC1 = LayoutOutput_; + + using ElementInputA0 = ElementInputA0_; + using ElementInputB0 = ElementInputB0_; + using ElementOutputC0 = ElementOutput_; + using ElementCompute = ElementCompute_; + using ElementInputB1 = ElementInputB0_; + + using ElementInputA1 = ElementOutputC0; + using ElementOutputC1 = ElementOutputC0; + + using EpilogueFunctorOp = EpilogueFunctorOp_; + + using TensorRefA = TensorRef; + using TensorRefB = TensorRef; + using TensorRefC = TensorRef; + using TensorVariance = TensorRef; + using TensorMean = TensorRef; + + using ThreadblockShape = ThreadblockShape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + + static int const kStages0 = Stages0; + static int const kStages1 = Stages1; + + using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + + /////////////////////////////////////////////////////////////////////////////////////////////// + + using MapArguments = cutlass::gemm::kernel::detail::MapArguments< + ElementInputA0, + LayoutInputA0, + cutlass::ComplexTransform::kNone, + 128 / cutlass::sizeof_bits::value, + ElementInputB0, + LayoutInputB0, + cutlass::ComplexTransform::kNone, + 128 / cutlass::sizeof_bits::value, + LayoutOutputC0, + kInternalTranspose + >; + + using DefaultGemmKernel = typename cutlass::gemm::kernel::DefaultGemm< + typename MapArguments::ElementA, + typename MapArguments::LayoutA, + MapArguments::kAlignmentA, + typename MapArguments::ElementB, + typename MapArguments::LayoutB, + MapArguments::kAlignmentB, + ElementOutputC0, + typename MapArguments::LayoutC, + ElementCompute, + OperatorClass, + ArchTag, + 
ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueFunctorOp, + SwizzleThreadBlock, + kStages0, + true, + typename cutlass::gemm::device::DefaultGemmConfiguration< + OperatorClass, ArchTag, ElementInputA0, ElementInputB0, ElementOutputC0, ElementCompute>::Operator, + cutlass::gemm::SharedMemoryClearOption::kNone + >::GemmKernel; + + /////////////////////////////////////////////////////////////////////////////////////////////// + + // Epilogue visitor + using EpilogueVisitor = kernel::EpilogueVisitorLayerNorm< + ThreadblockShape, + DefaultGemmKernel::kThreadCount, + typename DefaultGemmKernel::Epilogue::OutputTileIterator, + typename DefaultGemmKernel::Epilogue::AccumulatorFragmentIterator::AccumulatorTile, + ElementCompute, + ElementVariance, + ElementMean, + ElementLayernormCompute, + EpilogueFunctorOp, + kIsShiftedVariance + >; + + /// Epilogue + using Epilogue = typename cutlass::epilogue::threadblock::EpilogueWithVisitorFromExistingEpilogue< + EpilogueVisitor, + typename DefaultGemmKernel::Epilogue + >::Epilogue; + + // GEMM + using GemmEpilogueFusion = gemm::kernel::GemmWithEpilogueVisitor< + typename DefaultGemmKernel::Mma, + Epilogue, + SwizzleThreadBlock + >; + + using ApplyFinalReductionKernel = kernel::ApplyFinalReduction< + ElementVariance, + ElementMean, + ElementLayernormCompute, + ElementOutputC0, + ThreadblockShape, + kIsShiftedVariance + >; + +using GemmMainloopFusion = typename cutlass::gemm::device::GemmLayernormMainloopFusion< + ElementInputA1, LayoutInputA1, + ElementInputB1, LayoutInputB1, + ElementInputScaleBias, LayoutInputScaleBias, + ElementOutputC1, LayoutOutputC1, + ElementCompute, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueFunctorOp, + SwizzleThreadBlock, + kStages1 +>; + +public: + + /// Arguments class + struct Arguments { + + typename GemmEpilogueFusion::Arguments gemm0; + typename GemmMainloopFusion::Arguments gemm1; + typename ApplyFinalReductionKernel::Arguments reduction; + cutlass::gemm::GemmCoord extend; + + // + // Methods + // + Arguments() { } + + Arguments( + cutlass::gemm::GemmCoord problem_size0, + cutlass::gemm::GemmCoord problem_size1, + ElementInputA0 * ptr_A, + ElementInputB0 * ptr_B, + ElementOutputC0 * ptr_C, + ElementOutputC0 * ptr_D, + ElementOutputC0 * ptr_E, + ElementOutputC0 * ptr_O, + int64_t ldm_A, + int64_t ldm_B, + int64_t ldm_C, + int64_t ldm_D, + int64_t ldm_E, + int64_t ldm_O, + typename EpilogueFunctorOp::Params linear_scaling, + TensorVariance ref_Variance_, + TensorMean ref_Mean_, + TensorVariance ref_Gamma_, + TensorMean ref_Beta_, + ElementOutputC0 *ptr_Shifted_K = nullptr + ): + gemm0( + cutlass::gemm::GemmUniversalMode::kGemm, + {kInternalTranspose ? problem_size0.n() : problem_size0.m(),\ + kInternalTranspose ? problem_size0.m() : problem_size0.n(),\ + problem_size0.k()}, + {kInternalTranspose ? ptr_B : ptr_A, \ + kInternalTranspose ? ldm_B : ldm_A}, + {kInternalTranspose ? ptr_A : ptr_B, \ + kInternalTranspose ? ldm_A : ldm_B}, + typename EpilogueVisitor::Arguments( + linear_scaling, + {ptr_C, ldm_C}, + {ptr_D, ldm_D}, + ref_Variance_.data(), + ref_Mean_.data(), + ptr_Shifted_K + ) + ), + reduction( + MatrixCoord(kInternalTranspose ? problem_size0.n() : problem_size0.m(),\ + kInternalTranspose ? problem_size0.m() : problem_size0.n()), + ref_Variance_, + ref_Mean_, + ptr_Shifted_K + ), + gemm1( + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size1, + 1, + linear_scaling, + kInternalTranspose ? ptr_E : ptr_D, + kInternalTranspose ? 
ptr_D : ptr_E, + ref_Variance_.data(), + ref_Mean_.data(), + ref_Gamma_.data(), + ref_Beta_.data(), + ptr_O, + ptr_O, + problem_size1.m() * problem_size1.k(), + problem_size1.n() * problem_size1.k(), + problem_size1.n(), + problem_size1.n(), + problem_size1.k(), + problem_size1.k(), + problem_size1.m() * problem_size1.n(), + problem_size1.m() * problem_size1.n(), + kInternalTranspose ? ldm_E : ldm_D, + kInternalTranspose ? ldm_D : ldm_D, + ref_Variance_.layout().stride(0), + ref_Mean_.layout().stride(0), + ref_Gamma_.layout().stride(0), + ref_Beta_.layout().stride(0), + ldm_O, + ldm_O + ), + extend(problem_size0) + { + + } + }; + + struct Params { + + typename GemmEpilogueFusion::Params gemm0; + typename ApplyFinalReductionKernel::Params reduction; + MatrixCoord extend; + // + // Methods + // + Params() { } + + Params(Arguments const &args): + gemm0(args.gemm0), + reduction(args.reduction), + extend(MatrixCoord(args.extend.m(), args.extend.n())) + { + + } + }; + +public: + + // Gemm + + + // + // Methods + // + +private: + + Params params_; + GemmMainloopFusion gemm_fusion_op; + +public: + + /// Ctor + GemmLayernorm() { + + } + + /// Initialize + Status initialize(Arguments const &args) { + + params_ = Params(args); + cutlass::Status status; + size_t workspace_size = gemm_fusion_op.get_workspace_size(args.gemm1); + cutlass::device_memory::allocation workspace(workspace_size); + status = gemm_fusion_op.can_implement(args.gemm1); + CUTLASS_CHECK(status); + + status = gemm_fusion_op.initialize(args.gemm1, workspace.get()); + CUTLASS_CHECK(status); + + return cutlass::Status::kSuccess; + } + + /// Run + Status run(cudaStream_t stream) { + + // + // Launch the GEMM + layernorm kernel + // + + dim3 gemm_grid = SwizzleThreadBlock().get_grid_shape(params_.gemm0.grid_tiled_shape); + dim3 gemm_block(GemmEpilogueFusion::kThreadCount, 1, 1); + + int gemm_smem_size = int(sizeof(typename GemmEpilogueFusion::SharedStorage)); + + cutlass::Kernel<<>>(params_.gemm0); + + cudaError_t result = cudaGetLastError(); + + if (result != cudaSuccess) { + return cutlass::Status::kErrorInternal; + } + + // + // Launch the ApplyFinalReductionKernel + // + + // always performs reduction from leading dimension + int leading_dim_0 = kInternalTranspose ? params_.extend.row() : params_.extend.column(); + int leading_dim_1 = kInternalTranspose ? 
params_.extend.column() : params_.extend.row(); + + int thread_per_block = 128; + int block_per_row = (leading_dim_1 + thread_per_block - 1) / thread_per_block; + if (block_per_row < 4) { + thread_per_block = 32; + block_per_row = (leading_dim_1 + thread_per_block - 1) / thread_per_block; + } + + dim3 final_reduction_block(thread_per_block); + dim3 final_reduction_grid(block_per_row); + + Kernel<<< + final_reduction_grid, final_reduction_block, sizeof(typename ApplyFinalReductionKernel::SharedStorage), stream + >>>(params_.reduction); + + result = cudaGetLastError(); + + if (result != cudaSuccess) { + return cutlass::Status::kErrorInternal; + } + + // + // Launch the GEMM + mainloop fusion kernel + // + + cutlass::Status status = gemm_fusion_op(); + CUTLASS_CHECK(status); + + return cutlass::Status::kSuccess; + } + + /// Function call operator + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/38_syr2k_grouped/CMakeLists.txt b/examples/38_syr2k_grouped/CMakeLists.txt new file mode 100644 index 0000000000..461619ed3b --- /dev/null +++ b/examples/38_syr2k_grouped/CMakeLists.txt @@ -0,0 +1,36 @@ + +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +cutlass_example_add_executable( + 38_syr2k_grouped + syr2k_grouped.cu + ) + diff --git a/examples/38_syr2k_grouped/syr2k_grouped.cu b/examples/38_syr2k_grouped/syr2k_grouped.cu new file mode 100644 index 0000000000..c1fb82e839 --- /dev/null +++ b/examples/38_syr2k_grouped/syr2k_grouped.cu @@ -0,0 +1,1466 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief SYR2K Grouped Example. + + This workload computes a batch of SYR2K operations with distinct problem sizes. This example closely + follows 24_gemm_grouped. 
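+
+    Each group computes an independent rank-2k (SYR2K) update
+
+        C_i = alpha * (A_i * B_i^T + B_i * A_i^T) + beta * C_i
+
+    where C_i is an N_i-by-N_i symmetric matrix and A_i, B_i are N_i-by-K_i matrices, which is
+    why only the N and K dimensions of each problem are specified.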
+ + Examples: + + # Runs a grouped SYR2K with 100 random problem sizes + $ ./examples/38_syr2k_grouped/38_syr2k_grouped --groups=100 + + # Runs a grouped SYR2K with 100 random problem sizes (with SYR2K-K dimension equal to 1024) + $ ./examples/38_syr2k_grouped/38_syr2k_grouped --groups=100 --k=1024 --verbose=true + + # Runs a grouped SYR2K that is equivalent to a batched SYR2K + $ ./examples/38_syr2k_grouped/38_syr2k_grouped --groups=100 --n=1024 --k=1024 --verbose=true + + # Execute grouped SYR2K and profile with NSight + $ nv-nsight-cu-cli ./examples/38_syr2k_grouped/38_syr2k_grouped --n=256 --k=256 --verbose=true \ + --iterations=1 --reference-check=false + +*/ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include + +#include "cutlass/blas3.h" +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/kernel/rank_2k_grouped.h" +#include "cutlass/gemm/kernel/default_rank_2k_grouped.h" +#include "cutlass/gemm/device/rank_2k_grouped.h" +#include "cutlass/gemm/device/rank_2k.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/rank_2k_complex.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_norm.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double initialization_time_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double initialization_time_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), initialization_time_ms(initialization_time_ms), gflops(gflops), + status(status), error(error), passed(true) { } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + bool reference_check; + bool profile_initialization; + bool sort_problems; + + std::vector<cutlass::gemm::GemmCoord> problem_sizes; + + int alignment; + int problem_count; + int iterations; + int cuda_streams; + bool verbose; + float alpha; + float beta; + std::string benchmark_path; + + std::string output_tag; + std::ofstream output_file; + + using GroupScheduleMode = cutlass::gemm::kernel::GroupScheduleMode; + std::vector<GroupScheduleMode> scheduler_modes; + + std::unordered_map<std::string, GroupScheduleMode> + str_to_scheduler_mode = { + {"kDeviceOnly", GroupScheduleMode::kDeviceOnly}, + {"kHostPrecompute", GroupScheduleMode::kHostPrecompute} + }; + + struct GroupScheduleModeHash { + size_t operator()(GroupScheduleMode m) const { + return static_cast<size_t>(m); + } + }; + + std::unordered_map<GroupScheduleMode, std::string, GroupScheduleModeHash> + scheduler_mode_to_str = { + {GroupScheduleMode::kDeviceOnly, "kDeviceOnly"}, + {GroupScheduleMode::kHostPrecompute, "kHostPrecompute"} + }; + + std::vector<GroupScheduleMode> all_scheduler_modes = {GroupScheduleMode::kDeviceOnly, GroupScheduleMode::kHostPrecompute};
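+  // kDeviceOnly derives the mapping of threadblock tiles to problems on the device at run time,
+  // whereas kHostPrecompute builds that schedule on the host and stages it through the kernel's
+  // workspace, trading host-side setup time for less scheduling work on the device.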
+ + // + // Methods + // + + Options(): + help(false), + error(false), + alignment(8), + reference_check(true), + profile_initialization(false), + sort_problems(false), + problem_count(5), + iterations(20), + cuda_streams(0), + verbose(false), + alpha(1), + beta(), + scheduler_modes({GroupScheduleMode::kDeviceOnly}) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("alignment", alignment, 8); + cmd.get_cmd_line_argument("groups", problem_count, 5); + cmd.get_cmd_line_argument("alpha", alpha, 1.0f); + cmd.get_cmd_line_argument("beta", beta, 0.0f); + cmd.get_cmd_line_argument("iterations", iterations, 20); + cmd.get_cmd_line_argument("streams", cuda_streams, 0); + cmd.get_cmd_line_argument("verbose", verbose, false); + cmd.get_cmd_line_argument("reference-check", reference_check, true); + cmd.get_cmd_line_argument("profile-initialization", profile_initialization, false); + cmd.get_cmd_line_argument("sort-problems", sort_problems, false); + cmd.get_cmd_line_argument("benchmark", benchmark_path); + + std::vector scheduler_mode_strs; + cmd.get_cmd_line_arguments("scheduler-modes", scheduler_mode_strs); + + if (!scheduler_mode_strs.empty()) { + scheduler_modes.clear(); + if (scheduler_mode_strs.size() == 1 && scheduler_mode_strs[0] == "all") { + scheduler_modes = all_scheduler_modes; + } else { + for (std::string precomp_str : scheduler_mode_strs) { + auto it = str_to_scheduler_mode.find(precomp_str); + if (it != str_to_scheduler_mode.end()) { + scheduler_modes.push_back(it->second); + } else if (precomp_str == "all") { + std::cerr << "Flag --scheduler-modes=all must not contain other scheduler modes in list." << std::endl; + error = true; + return; + } else { + std::cerr << "Unrecognized scheduler mode '" << precomp_str << "'" << std::endl; + error = true; + return; + } + } + } + } + + std::string output_path; + cmd.get_cmd_line_argument("tag", output_tag); + cmd.get_cmd_line_argument("output_file", output_path); + + if (!output_path.empty()) { + + std::ios_base::openmode open_mode = std::ios_base::out; + + std::ifstream input_file(output_path.c_str()); + + if (input_file.good()) { + open_mode = std::ios_base::app; + input_file.close(); + } + + output_file.open(output_path.c_str(), open_mode); + + if (output_file.good() && open_mode != std::ios_base::app) { + output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPs\n"; + } + } + + // Decide how to initialize the problems + if (!benchmark_path.empty()) { + if (!benchmark_problems()) { + error = true; + problem_sizes.clear(); + return; + } + } + else { + randomize_problems(cmd); + } + } + + void randomize_problems(cutlass::CommandLine &cmd) { + + // + // For now, randomly choose the problem sizes. + // + + int cmd_line_m = -1; + int cmd_line_n = -1; + int cmd_line_k = -1; + + cmd.get_cmd_line_argument("m", cmd_line_m); + cmd.get_cmd_line_argument("n", cmd_line_n); + cmd.get_cmd_line_argument("k", cmd_line_k); + + // SYR2K is defined via only N and K. + if (cmd_line_m != -1) { + std::cerr << "Parameter M is ignored for SYR2K\n"; + error = true; + return; + } + + problem_sizes.reserve(problem_count); + + for (int i = 0; i < problem_count; ++i) { + int n = cmd_line_n; + int k = cmd_line_k; + + if (n < 1) { + n = alignment * ((rand() % 256) + 1); + } + + if (k < 1) { + k = alignment * ((rand() % 256) + 1); + } + + // SYR2K is defined only in terms of N and K. 
Replicate N into + // the SYR2K-N dimension. + cutlass::gemm::GemmCoord problem(n, n, k); + + problem_sizes.push_back(problem); + } + } + + /// Load a benchmark + bool benchmark_problems() { + std::ifstream file(benchmark_path); + if (!file.good()) { + return false; + } + + while (file.good()) { + + int idx = -1; + std::string extent_str; + + file >> idx >> extent_str; + + if (idx < 0 || extent_str.empty()) { + break; + } + + cutlass::gemm::GemmCoord extent; + std::vector<std::string> tokens; + + cutlass::CommandLine::tokenize(tokens, extent_str, 'x'); + + for (int i = 0; i < int(tokens.size()); ++i) { + int x = std::atoi(tokens.at(i).c_str()); + + // round up + if (x % alignment) { + x += (alignment - (x % alignment)); + } + + extent.at(i) = x; + } + + if (extent.product()) { + problem_sizes.push_back(extent); + } + } + + problem_count = int(problem_sizes.size()); + return true; + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "38_syr2k_grouped\n\n" + << " This example profiles the performance of a 'grouped' SYR2K kernel. This example closely follows 24_gemm_grouped.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --benchmark=<str> Executes a benchmark problem size.\n" + << " --output_file=<str> Path to a CSV file to output results. If it exists already, results are appended.\n" + << " --tag=<str> String tag to prepend to the CSV file.\n" + << " --groups=<int> Number of individual SYR2K problems (default: --groups=15)\n" + << " --m=<int> Sets the M dimension for all groups. Otherwise, it is selected randomly\n" + << " --n=<int> Sets the N dimension for all groups. Otherwise, it is selected randomly\n" + << " --k=<int> Sets the K dimension for all groups. Otherwise, it is selected randomly\n" + << " --alpha=<f32> Epilogue scalar alpha (real part)\n" + << " --beta=<f32> Epilogue scalar beta (real part)\n" + << " --scheduler-modes=<str> List of scheduler modes to be profiled for the grouped GEMM scheduler (default: --scheduler-modes=kDeviceOnly)\n" + << " --iterations=<int> Number of profiling iterations to perform.\n" + << " --reference-check=<bool> If true, performs reference check.\n" + << " --verbose=<bool> If true, prints problem sizes and batching structure.\n" + << " --profile-initialization=<bool> If true, profiles the device-level kernel's initialization.\n" + << " --sort-problems=<bool> If true, sorts problem sizes in descending order of SYR2K-K dimension.\n"; + + out << "\n\nExamples:\n\n" + + << "# Runs a grouped SYR2K with 100 random problem sizes\n" + << "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --groups=100\n\n" + + << "# Runs a grouped SYR2K with 100 random problem sizes (with K dimension equal to 1024)\n" + << "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --groups=100 --k=1024 --verbose=true\n\n" + + << "# Runs a grouped SYR2K that is equivalent to a batched SYR2K\n" + << "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --groups=100 --n=1024 --k=1024 --verbose=true\n\n" + + << "# Runs a grouped SYR2K with each different scheduler mode\n" + << "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --scheduler-modes=all\n\n" + + << "# Runs a grouped SYR2K with each different scheduler mode and profiles host-side initialization time\n" + << "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --scheduler-modes=all --profile-initialization=true\n\n" + + << "# Runs a grouped SYR2K problem given an externally supplied benchmark file. This is a text file in which\n" + << "# Each line contains a unique group index and an MxNxK triple indicating problem size.
NOTE that the\n" + << "# GEMM-M and GEMM-N dimensions must match.\n" + << "#\n" + << "# For example, assume the following are the contents of 'problems.txt'\n" + << "#\n" + << "# 0 256x256x520\n" + << "# 1 264x264x1024\n" + << "# 2 48x48x1024\n" + << "#\n" + << "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --benchmark=problems.txt\n\n" + + << "# Execute Grouped SYR2K and profile with NSight\n" + << "$ nv-nsight-cu-cli ./examples/38_syr2k_grouped/38_syr2k_grouped --n=256 --k=256 --verbose=true --iterations=1 --reference-check=false\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = int64_t(); + + for (auto const & problem : problem_sizes) { + fmas += problem.product(); + } + + // SYR2K is defined as (A x BT) + (B x AT), so the number of FMAs is twice that in a GEMM + fmas *= 2; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class BaseTestbed { +public: + // + // Type definitions + // + + using ElementA = typename Rank2K::ElementA; + using ElementB = typename Rank2K::ElementB; + using ElementC = typename Rank2K::ElementC; + using ElementAccumulator = typename Rank2K::ElementAccumulator; + + using EpilogueOutputOp = typename Rank2K::Rank2Kkernel::Epilogue::OutputOp; + using ElementCompute = typename EpilogueOutputOp::ElementCompute; + + using LayoutA = typename Rank2K::LayoutA; + using LayoutB = typename Rank2K::LayoutB; + using LayoutC = typename Rank2K::LayoutC; + + using MatrixCoord = typename LayoutC::TensorCoord; + + // + // Data members + // + + Options & options; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint32_t seed; + + cutlass::DeviceAllocation problem_sizes_device; + + std::vector offset_A; + std::vector offset_B; + std::vector offset_C; + std::vector offset_D; + + std::vector lda_host; + std::vector ldb_host; + std::vector ldc_host; + std::vector ldd_host; + + cutlass::DeviceAllocation lda; + cutlass::DeviceAllocation ldb; + cutlass::DeviceAllocation ldc; + cutlass::DeviceAllocation ldd; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + + cutlass::DeviceAllocation ptr_A; + cutlass::DeviceAllocation ptr_B; + cutlass::DeviceAllocation ptr_C; + cutlass::DeviceAllocation ptr_D; + + BaseTestbed( + Options &options_, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint32_t seed_ = 3080 + ): + options(options_), init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + int problem_count() const { + return options.problem_count; + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + Element *ptr, + size_t capacity, + cutlass::Distribution::Kind dist_kind, + uint32_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else if 
(bits_output == 16) { + if (cutlass::sizeof_bits::value <= 16) { + scope_max = 5; + scope_min = -5; + } + else { + scope_max = 8; + scope_min = -8; + } + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::device::BlockFillRandomUniform( + ptr, capacity, seed, scope_max, scope_min, 0); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::device::BlockFillRandomGaussian( + ptr, capacity, seed, Element(), Element(0.5f)); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + // Fill with increasing elements + cutlass::reference::device::BlockFillSequential( + ptr, capacity, Element(1), Element()); + } + else { + + // Fill with all 1s + cutlass::reference::device::BlockFillSequential( + ptr, capacity, Element(), Element(1)); + } + } + + /// Allocates device-side data + void allocate() { + int64_t total_elements_A = 0; + int64_t total_elements_B = 0; + int64_t total_elements_C = 0; + int64_t total_elements_D = 0; + + lda_host.resize(problem_count()); + ldb_host.resize(problem_count()); + ldc_host.resize(problem_count()); + ldd_host.resize(problem_count()); + + for (int32_t i = 0; i < problem_count(); ++i) { + + auto problem = options.problem_sizes.at(i); + + lda_host.at(i) = LayoutA::packed({problem.n(), problem.k()}).stride(0); + ldb_host.at(i) = LayoutB::packed({problem.n(), problem.k()}).stride(0); + ldc_host.at(i) = LayoutC::packed({problem.n(), problem.n()}).stride(0); + ldd_host.at(i) = LayoutC::packed({problem.n(), problem.n()}).stride(0); + + offset_A.push_back(total_elements_A); + offset_B.push_back(total_elements_B); + offset_C.push_back(total_elements_C); + offset_D.push_back(total_elements_D); + + int64_t elements_A = problem.n() * problem.k(); + int64_t elements_B = problem.n() * problem.k(); + int64_t elements_C = problem.n() * problem.n(); + int64_t elements_D = problem.n() * problem.n(); + + total_elements_A += elements_A; + total_elements_B += elements_B; + total_elements_C += elements_C; + total_elements_D += elements_D; + } + + lda.reset(problem_count()); + ldb.reset(problem_count()); + ldc.reset(problem_count()); + ldd.reset(problem_count()); + + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + } + + /// Initializes device-side data + void initialize() { + problem_sizes_device.reset(problem_count()); + problem_sizes_device.copy_from_host(options.problem_sizes.data()); + + lda.copy_from_host(lda_host.data()); + ldb.copy_from_host(ldb_host.data()); + ldc.copy_from_host(ldc_host.data()); + ldd.copy_from_host(ldd_host.data()); + + // + // Assign pointers + // + + std::vector ptr_A_host(problem_count()); + std::vector ptr_B_host(problem_count()); + std::vector ptr_C_host(problem_count()); + std::vector ptr_D_host(problem_count()); + + for (int32_t i = 0; i < problem_count(); ++i) { + ptr_A_host.at(i) = block_A.get() + offset_A.at(i); + ptr_B_host.at(i) = block_B.get() + offset_B.at(i); + ptr_C_host.at(i) = block_C.get() + offset_C.at(i); + ptr_D_host.at(i) = block_D.get() + offset_D.at(i); + } + + ptr_A.reset(problem_count()); + ptr_A.copy_from_host(ptr_A_host.data()); + + ptr_B.reset(problem_count()); + ptr_B.copy_from_host(ptr_B_host.data()); + + ptr_C.reset(problem_count()); + ptr_C.copy_from_host(ptr_C_host.data()); + + ptr_D.reset(problem_count()); + ptr_D.copy_from_host(ptr_D_host.data()); + + // + // Initialize the problems of the workspace + // + + initialize_tensor(block_A.get(), block_A.size(), init_A, seed * 
2021); + initialize_tensor(block_B.get(), block_B.size(), init_B, seed * 2022); + initialize_tensor(block_C.get(), block_C.size(), init_C, seed * 2023); + + cutlass::reference::device::BlockFillSequential( + block_D.get(), block_D.size(), ElementC(), ElementC()); + } + + /// Verifies the result is a SYR2K + bool verify() { + + bool passed = true; + + for (int32_t i = 0; i < problem_count(); ++i) { + cutlass::gemm::GemmCoord problem = options.problem_sizes.at(i); + + LayoutA layout_A(lda_host.at(i)); + LayoutB layout_B(ldb_host.at(i)); + LayoutC layout_C(ldc_host.at(i)); + LayoutC layout_D(ldd_host.at(i)); + + cutlass::HostTensor host_A( + typename LayoutA::TensorCoord(problem.n(), problem.k()), /*device_backed=*/false); + cutlass::HostTensor host_B( + typename LayoutB::TensorCoord(problem.n(), problem.k()), /*device_backed=*/false); + cutlass::HostTensor host_C( + typename LayoutC::TensorCoord(problem.n(), problem.n()), /*device_backed=*/false); + cutlass::HostTensor host_D( + typename LayoutC::TensorCoord(problem.n(), problem.n()), /*device_backed=*/false); + + cutlass::device_memory::copy_to_host(host_A.host_data(), block_A.get() + offset_A.at(i), problem.n() * problem.k()); + cutlass::device_memory::copy_to_host(host_B.host_data(), block_B.get() + offset_B.at(i), problem.n() * problem.k()); + cutlass::device_memory::copy_to_host(host_C.host_data(), block_C.get() + offset_C.at(i), problem.n() * problem.n()); + cutlass::reference::host::BlockFillSequential( + host_D.host_data(), problem.n() * problem.n(), ElementC(), ElementC()); + + MatrixCoord extent_C{problem.n(), problem.n()}; + + // Reference Rank2K + cutlass::reference::host::Rank2KComplex< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementC, ElementAccumulator + >( + problem, + (double)options.alpha, + host_A.host_view(), + Rank2K::kTransformA, + host_B.host_view(), + Rank2K::kTransformB, + (double)options.beta, + host_C.host_view(), + host_D.host_view(), + ElementAccumulator(0), + Rank2K::kFillModeC, + Rank2K::kBlasMode + ); + + // Copy to host memory + std::vector matrix_D(layout_D.capacity(extent_C)); + cutlass::device_memory::copy_to_host(matrix_D.data(), block_D.get() + offset_D.at(i), matrix_D.size()); + + cutlass::TensorView view_D(matrix_D.data(), layout_D, extent_C); + cutlass::TensorView view_Ref = host_D.host_view(); + + // Reference check + passed = cutlass::reference::host::TensorEquals(view_D, view_Ref); + + if (!passed) { + std::cerr << "\n***\nError - problem " << i << " failed the QA check\n***\n" << std::endl; + return passed; + } + } + + return passed; + } +}; + +template +class TestbedConventional : BaseTestbed { +public: + TestbedConventional( + Options &options_, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint32_t seed_ = 3080 + ): BaseTestbed(options_, init_A_, init_B_, init_C_, seed_) {} + + /// Verbose printing of problem sizes + void print_problem_sizes() { + + // Print groups + std::cout << this->problem_count() << " groups:\n"; + + int32_t idx = 0; + int64_t total_tiles = 0; + + for (auto const & problem : this->options.problem_sizes) { + int tiles = + ((problem.m() + Rank2K::ThreadblockShape::kM - 1) / Rank2K::ThreadblockShape::kM) * + ((problem.n() + Rank2K::ThreadblockShape::kN - 1) / Rank2K::ThreadblockShape::kN); + + total_tiles += tiles; + + std::cout << " [" << idx << "]: " + << problem.m() << "-by-" << 
problem.n() << "-by-" << problem.k() + << " (" << tiles << " threadblock tiles)" << "\n"; + + ++idx; + } + std::cout << std::endl; + } + + /// Executes a conventional SYR2K kernel. + Result profile() { + std::cout << "Conventional Rank2K:\n" + << "====================================================" << std::endl; + + Result result; + result.passed = false; + + // Initialize the problem + this->allocate(); + this->initialize(); + + if (this->options.verbose) { + print_problem_sizes(); + } + + // + // Create CUDA streams to maximize concurrency of SYR2K kernels + // + int32_t effective_streams = (this->options.cuda_streams ? this->options.cuda_streams : 1); + std::vector cuda_streams; + char const *provider = "CUTLASS"; + + // + // Warmup run + // + + if (this->options.cuda_streams) { + for (int i = 0; i < this->options.cuda_streams; ++i) { + cudaStream_t stream; + + result.error = cudaStreamCreate(&stream); + if (result.error != cudaSuccess) { + std::cerr << "Failed to create CUDA stream." << std::endl; + return result; + } + cuda_streams.push_back(stream); + } + } + else { + cuda_streams.push_back(nullptr); + } + + // Use 'D' for the in/out workspace + this->block_D.copy_from_device(this->block_C.get()); + + for (size_t i = 0; i < this->options.problem_sizes.size(); ++i) { + cutlass::gemm::GemmCoord const & problem = this->options.problem_sizes[i]; + int32_t batch_count = 1; + int64_t lda = this->lda_host.at(i); + int64_t ldb = this->ldb_host.at(i); + int64_t ldc = this->ldc_host.at(i); + typename Rank2K::ElementA* ptrA = this->block_A.get() + this->offset_A.at(i); + typename Rank2K::ElementB* ptrB = this->block_B.get() + this->offset_B.at(i); + typename Rank2K::ElementC* ptrC = this->block_C.get() + this->offset_C.at(i); + typename Rank2K::ElementC* ptrD = this->block_D.get() + this->offset_D.at(i); + + // + // Initialize the CUTLASS SYR2K operator + // + + // Configure the SYR2K arguments + typename Rank2K::EpilogueOutputOp::Params epilogue_op(this->options.alpha, this->options.beta); + + typename Rank2K::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem, + batch_count, + epilogue_op, + (void const *)ptrA, + (void const *)ptrB, + (void const *)ptrC, + (void *)ptrD, + int64_t(), + int64_t(), + int64_t(), + int64_t(), + int64_t(lda), + int64_t(ldb), + int64_t(ldc), + int64_t(ldc) + }; + + Rank2K rank2k_op; + + cutlass::Status status = rank2k_op.initialize(arguments); + + if (status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS error on line " << __LINE__ << std::endl; + return result; + } + + status = rank2k_op(); + + if (status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS error on line " << __LINE__ << std::endl; + return result; + } + } + + // + // Wait for completion + // + + result.error = cudaDeviceSynchronize(); + + if (result.error != cudaSuccess) { + std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error); + return result; + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // + // Wait for completion + // + + result.error = cudaDeviceSynchronize(); + + if (result.error != cudaSuccess) { + std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error); + return result; + } + + // Record an event at the start of a series of SYR2K operations + result.error = 
cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + size_t last_stream_idx = 0; + + for (int iter = 0; iter < this->options.iterations; ++iter) { + for (size_t i = 0; i < this->options.problem_sizes.size(); ++i) { + cutlass::gemm::GemmCoord const & problem = this->options.problem_sizes[i]; + int32_t batch_count = 1; + int64_t lda = this->lda_host.at(i); + int64_t ldb = this->ldb_host.at(i); + int64_t ldc = this->ldc_host.at(i); + typename Rank2K::ElementA* ptrA = this->block_A.get() + this->offset_A.at(i); + typename Rank2K::ElementB* ptrB = this->block_B.get() + this->offset_B.at(i); + typename Rank2K::ElementC* ptrC = this->block_C.get() + this->offset_C.at(i); + typename Rank2K::ElementC* ptrD = this->block_D.get() + this->offset_D.at(i); + + last_stream_idx = (i % effective_streams); + + // + // Initialize the CUTLASS SYR2K operator + // + + // Configure the SYR2K arguments + typename Rank2K::EpilogueOutputOp::Params epilogue_op(this->options.alpha, this->options.beta); + + typename Rank2K::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem, + batch_count, + epilogue_op, + (void const *)ptrA, + (void const *)ptrB, + (void const *)ptrC, + (void *)ptrD, + int64_t(), + int64_t(), + int64_t(), + int64_t(), + int64_t(lda), + int64_t(ldb), + int64_t(ldc), + int64_t(ldc) + }; + + Rank2K rank2k_op; + + cutlass::Status status = rank2k_op.initialize(arguments); + + if (status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS error on line " << __LINE__ << std::endl; + return result; + } + + status = rank2k_op(cuda_streams[last_stream_idx]); + + if (status != cutlass::Status::kSuccess) { + std::cerr << "CUTLASS error on line " << __LINE__ << std::endl; + return result; + } + } + } + + // + // Stop profiling loop + // + + // Record an event when the SYR2K operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Wait for work to be completed + // + + result.error = cudaDeviceSynchronize(); + + if (result.error != cudaSuccess) { + std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error); + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(this->options.iterations); + result.gflops = this->options.gflops(result.runtime_ms / 1000.0); + + // + // Cleanup + // + + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + for (auto stream : cuda_streams) { + if (stream) { + (void)cudaStreamDestroy(stream); + } + } + + std::cout << " " << this->options.problem_sizes.size() << " conventional Rank2Ks launched" << std::endl; + std::cout << std::endl; + std::cout << " " << "Conventional Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " " << "Conventional GFLOPS: " << result.gflops << std::endl; + + if (this->options.output_file.good()) { + this->options.output_file << this->options.output_tag << "," << provider << ",conventional," + << this->problem_count() << "," << result.runtime_ms << "," << result.gflops << std::endl; + } + + result.passed = true; + return result; + } +}; + +template +class TestbedGrouped : BaseTestbed { +public: + TestbedGrouped( + Options &options_, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint32_t seed_ = 3080 + ) : BaseTestbed(options_, init_A_, init_B_, init_C_, seed_) {} + + // Redefine Rank2K with different GroupScheduleMode_ + using Rank2Kkernel = typename cutlass::gemm::kernel::DefaultRank2KGrouped< + typename Rank2K_::ElementA, typename Rank2K_::LayoutA, Rank2K_::kTransformA, Rank2K_::kAlignmentA, + typename Rank2K_::ElementB, typename Rank2K_::LayoutB, Rank2K_::kTransformB, Rank2K_::kAlignmentB, + typename Rank2K_::ElementC, typename Rank2K_::LayoutC, Rank2K_::kFillModeC, + typename Rank2K_::ElementAccumulator, + typename Rank2K_::OperatorClass, + typename Rank2K_::ArchTag, + typename Rank2K_::ThreadblockShape, + typename Rank2K_::WarpShape, + typename Rank2K_::InstructionShape, + typename Rank2K_::EpilogueOutputOp, + typename Rank2K_::ThreadblockSwizzle, + Rank2K_::kStages, + typename Rank2K_::Operator::ArchMmaOperator::Operator, + Rank2K_::kBlasMode, + GroupScheduleMode_>::Rank2Kkernel; + + using Rank2K = cutlass::gemm::device::Rank2KGrouped; + + /// Verbose printing of problem sizes + void print_problem_sizes() { + + // Print groups + std::cout << this->problem_count() << " groups:\n"; + + int32_t idx = 0; + int64_t total_tiles = 0; + + for (auto const & problem : this->options.problem_sizes) { + int tiles = Rank2K::problem_tile_count(problem); + total_tiles += tiles; + + std::cout << " [" << idx << "]: " + << problem.m() << "-by-" << problem.n() << "-by-" << problem.k() + << " (" << tiles << " threadblock tiles)" << "\n"; + + ++idx; + } + std::cout << std::endl; + } + + /// Sort problems in descending order of problem-K dimension + void sort_problems() { + Rank2K::sort_problems(this->options.problem_count, + this->options.problem_sizes.data(), + this->lda_host.data(), + this->ldb_host.data(), + this->ldc_host.data(), + this->ldd_host.data(), + this->offset_A.data(), + this->offset_B.data(), + this->offset_C.data(), + this->offset_D.data()); + } + + /// Executes a grouped kernel and measures runtime. 
+ Result profile() { + std::string sched_mode = this->options.scheduler_mode_to_str.find(GroupScheduleMode_)->second; + std::cout << std::endl; + std::cout << "Grouped Rank2K (CUTLASS) with mode " << sched_mode << ":\n" + << "====================================================" << std::endl; + + Result result; + + int threadblock_count = Rank2K::sufficient(this->options.problem_sizes.data(), this->options.problem_count); + + // Early exit + if (!threadblock_count) { + std::cout << "Active CUDA device lacks hardware resources to run CUTLASS Grouped SYR2K kernel." << std::endl; + return result; + } + + result.passed = false; + + // Initialize the problem + this->allocate(); + if (this->options.sort_problems) { + sort_problems(); + } + this->initialize(); + + if (this->options.verbose) { + print_problem_sizes(); + } + + // Configure the Rank2K arguments + typename Rank2K::EpilogueOutputOp::Params epilogue_op(this->options.alpha, this->options.beta); + + // Configure Rank2K arguments + typename Rank2K::Arguments args( + cutlass::gemm::GemmUniversalMode::kGemm, + this->problem_sizes_device.get(), + this->problem_count(), + threadblock_count, + epilogue_op, + this->ptr_A.get(), + this->ptr_B.get(), + this->ptr_C.get(), + this->ptr_D.get(), + this->lda.get(), + this->ldb.get(), + this->ldc.get(), + this->ldd.get(), + this->options.problem_sizes.data() + ); + + // Initialize the Rank2K object + Rank2K rank2k{}; + size_t workspace_size = rank2k.get_workspace_size(args); + cutlass::DeviceAllocation workspace(workspace_size); + + result.status = rank2k.initialize(args, workspace.get()); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to initialize CUTLASS Grouped Rank2K kernel." << std::endl; + return result; + } + + // Run the grouped Rank2K object + result.status = rank2k.run(); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to run CUTLASS Grouped Rank2K kernel." << std::endl; + return result; + } + + // Wait for completion + result.error = cudaDeviceSynchronize(); + + if (result.error != cudaSuccess) { + std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error); + return result; + } + + // + // Verify correctness + // + result.passed = true; + if (this->options.reference_check) { + result.passed = this->verify(); + } + + // + // Warm-up run of the grouped Rank2K object + // + result.status = rank2k.run(); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to run CUTLASS Grouped Rank2K kernel." << std::endl; + return result; + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of SYR2K operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < this->options.iterations; ++iter) { + rank2k(); + } + + // + // Stop profiling loop + // + + // Record an event when the Rank2K operations have been launched. 
+ result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(this->options.iterations); + result.gflops = this->options.gflops(result.runtime_ms / 1000.0); + + // + // Cleanup + // + + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + // Optionally profile initialization + if (this->options.profile_initialization) { + // Warm up + rank2k.initialize(args, workspace.get()); + + auto start_time = std::chrono::high_resolution_clock::now(); + for (int32_t i = 0; i < this->options.iterations; ++i) { + rank2k.initialize(args, workspace.get()); + } + auto end_time = std::chrono::high_resolution_clock::now(); + + std::chrono::duration duration = end_time - start_time; + duration /= double(this->options.iterations); + result.initialization_time_ms = duration.count(); + } + + int64_t total_tiles = Rank2K::group_tile_count(args); + std::cout << " " << total_tiles << " total threadblock tiles." << std::endl; + + std::cout << std::endl; + std::cout << " " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " " << "Grouped GFLOPs: " << result.gflops << std::endl; + if (this->options.profile_initialization) { + std::cout << " " << "Init Runtime: " << result.initialization_time_ms << " ms" << std::endl; + } + + if (this->options.output_file.good()) { + this->options.output_file << this->options.output_tag << ",CUTLASS,grouped-" << sched_mode << "," + << this->problem_count() << "," << result.runtime_ms << "," << result.gflops << std::endl; + } + + std::cout << "\nPassed\n"; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (__CUDACC_VER_MAJOR__ < 11 || props.major < 8) { + + // + // This example requires an NVIDIA Ampere-architecture GPU. + // + + std::cout + << "CUTLASS's Grouped Rank2K example requires a GPU of NVIDIA's Ampere Architecture or " + << "later (compute capability 80 or greater).\n"; + + return 0; + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." 
<< std::endl; + return -1; + } + + // + // Define the Grouped and Conventional Rank2K types + // + + using ElementA = double; + using ElementB = double; + using ElementOutput = double; + using ElementAccumulator = double; + const cutlass::FillMode kFillModeC = cutlass::FillMode::kLower; + const int kAlignmentA = 1; + const int kAlignmentB = 1; + const cutlass::ComplexTransform kTransformA = cutlass::ComplexTransform::kNone; + const cutlass::ComplexTransform kTransformB = cutlass::ComplexTransform::kNone; + + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + + using OperatorClass = cutlass::arch::OpClassTensorOp; + using ArchTag = cutlass::arch::Sm80; + + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, 1, + ElementAccumulator, ElementAccumulator>; + + // NOTE: Threadblock swizzling is currently not supported by CUTLASS's grouped kernels. + // This parameter is passed in at present to match the APIs of other kernels. The parameter + // is unused within the kernel. + using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + + const int kStages = 4; + const bool kSplitKSerial = false; + using Operator = cutlass::arch::OpMultiplyAdd; + const cutlass::BlasMode kBlasMode = cutlass::BlasMode::kSymmetric; + + // Define a grouped Rank2K kernel with all template parameters set except + // for scheduling mode. This will be used as the template for all scheduling + // modes executed. + using Rank2Kkernel = typename cutlass::gemm::kernel::DefaultRank2KGrouped< + ElementA, LayoutA, kTransformA, kAlignmentA, + ElementB, LayoutB, kTransformB, kAlignmentB, + ElementOutput, LayoutC, kFillModeC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + kStages, + Operator, + kBlasMode>::Rank2Kkernel; + + using Rank2KGrouped = cutlass::gemm::device::Rank2KGrouped; + + // Rank2k operator + using Rank2KConventional = cutlass::gemm::device::Rank2K< + ElementA, LayoutA, + ElementB, LayoutB, + ElementOutput, LayoutC, kFillModeC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + kStages, + kAlignmentA, + kAlignmentB, + kSplitKSerial, + Operator, + kTransformA, + kTransformB, + kBlasMode + >; + + // + // Profile it + // + + TestbedConventional testbed(options); + + Result result = testbed.profile(); + if (!result.passed) { + std::cout << "Profiling CUTLASS conventional Rank2K has failed.\n"; + std::cout << "\nFailed\n"; + return -1; + } + + using GroupScheduleMode = cutlass::gemm::kernel::GroupScheduleMode; + for (GroupScheduleMode mode : options.scheduler_modes) { + Result result; + switch (mode) { + case GroupScheduleMode::kDeviceOnly: + { + TestbedGrouped runner(options); + result = runner.profile(); + break; + } + case GroupScheduleMode::kHostPrecompute: + { + TestbedGrouped runner(options); + result = runner.profile(); + break; + } + } + + if (result.error != cudaSuccess) { + return 1; + } + + // Override verbose flag to avoid printing duplicate information for each scheduling mode + options.verbose = false; + } + + return 0; +} + 
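The FLOP accounting used by Options::gflops() above can be easier to follow in isolation. The sketch below restates it outside the example (the function name syr2k_gflops and the (n, k) pair representation are illustrative choices, not part of the example code): each rank-2k update D = alpha * (A * B^T + B * A^T) + beta * C with N-by-K operands is counted as 2 * N * N * K fused multiply-adds, i.e. twice the FMAs of one N-by-N-by-K GEMM, and each FMA contributes two floating-point operations.

#include <cstdint>
#include <utility>
#include <vector>

// Restates the accounting in Options::gflops(): FMAs = 2 * sum_i(N_i * N_i * K_i), FLOPs = 2 * FMAs.
double syr2k_gflops(std::vector<std::pair<int64_t, int64_t>> const &nk_sizes, double runtime_s) {
  int64_t fmas = 0;
  for (auto const &nk : nk_sizes) {
    fmas += nk.first * nk.first * nk.second;   // one N-by-N-by-K GEMM worth of FMAs per group
  }
  fmas *= 2;                                   // two rank-k updates: A * B^T and B * A^T
  return 2.0 * double(fmas) / 1.0e9 / runtime_s;
}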
+///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/39_gemm_permute/CMakeLists.txt b/examples/39_gemm_permute/CMakeLists.txt new file mode 100644 index 0000000000..dd916fdf5d --- /dev/null +++ b/examples/39_gemm_permute/CMakeLists.txt @@ -0,0 +1,36 @@ + +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +cutlass_example_add_executable( + 39_gemm_permute + gemm_permute.cu + ) + diff --git a/examples/39_gemm_permute/gemm_permute.cu b/examples/39_gemm_permute/gemm_permute.cu new file mode 100644 index 0000000000..3651b9c568 --- /dev/null +++ b/examples/39_gemm_permute/gemm_permute.cu @@ -0,0 +1,1223 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief GEMM Permute Example. + + This example computes batched GEMM operations with output results permuted as reshaped tensors. + + We provide layout plugin as a flexible tool for users to add any customized input/output tensor permute operation, + or any other generalized global memory writeout address computation. To add a customized layout, add new class + in include/cutlass/layout/permute.h + + In this example we use several permute operations (permute([0, 2, 1, 3])) + + In this example, we used Tensor4DPermuteBMM0213 layout to perform Batched GEMM with permute([0, 2, 1, 3]) on BMM + whole output tensor, and used Tensor5DPermute20314 layout to perform Normal GEMM with permute([2, 0, 3, 1, 4]) on + output matrix. The address computations are performed in compute(col_init, row_init, stride_init, + BMM_batch_idx) with {col_permute, row_permute and stride_permute} as new addresses after permute op. + (check include/cutlass/layout/permute.h) + + Tips: + + 1) Make sure to set batch_stride to zero for BMM permute; also the BMM GEMM should be in mode + cutlass::gemm::GemmUniversalMode::kBatched instead of kArray. + + 2) When the contiguous dimension is touched in permute op (for example [0, 2, 3, 1] for row-major matrix + or [1, 0, 2, 3] for column-major), Alignment should be set to 1 for the corresponding matrix. + If the last dimension is untouched, one can set Alignment to be larger like 8 in our example. + As a result, permute op without touching the unit stride dimension is recommended to obtain the best performance. 
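    To make the Tensor4DPermuteBMM0213 description above concrete, the following coordinate walk-through is an
    editorial sketch inferred from the reshape/permute description (the authoritative address computation lives in
    include/cutlass/layout/permute.h). For an element (b, m, n) of the [B, M, N] batched output, with 0 <= b < B:

        reshape  [B, M, N] -> [B/D1, D1, M, N]      : coordinate becomes (b / D1, b % D1, m, n)
        permute([0, 2, 1, 3])                       : coordinate becomes (b / D1, m, b % D1, n)
        view as  [B/D1, M, D1*N] batched matrix     : batch = b / D1, row = m, column = (b % D1) * N + n

    so the M dimension is untouched while the D1 sub-batch index is interleaved into the output columns.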
+ + Examples: + + # Runs a batched GEMM with 96 batches + $ ./examples/39_gemm_permute/39_gemm_permute --problem-count=96 + + # Runs a batched GEMM with 96 batches (with GEMM-K dimension equal to 1024) + $ ./examples/39_gemm_permute/39_gemm_permute --problem-count=96 --k=1024 --verbose=true + + # Execute batched GEMM and profile with NSight + $ nv-nsight-cu-cli ./examples/39_gemm_permute/39_gemm_permute --m=256 --n=192 --k=256 --verbose=true --iterations=1 --reference-check=false + +*/ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm_complex.h" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_norm.h" + +#include "cutlass/layout/permute.h" + +#include "layouts.h" +#include "permute_info.h" + +/// Tensor4DPermuteBMM0213 ---> +/// Permute layout function for 4-D permuted tensors for BMM with BMM tensor (dimension as [B, M, N]) reshaped +/// as [B/D1, D1, M, N]. Then perform permute([0, 2, 1, 3]) on the corresponding whole BMM tensor. +int constexpr D1 = 12; + +/// Tensor5DPermute20314 ---> +/// Permute layout function for 5-D permuted tensors with matrix (dimension as [M, N]) reshaped +/// as [M/T1, T1, T2, T3, N/T2/T3]. Then perform permute([2, 0, 3, 1, 4]) on the corresponding tensor. +int constexpr T1 = 16; +int constexpr T2 = 3; +int constexpr T3 = 8; + +/// Tensor4DPermute0213 ---> +/// Permute layout function for 4-D permuted tensors with matrix (dimension as [M, N]) reshaped +/// as [M/S1, S1, S2, N/S2]. Then perform permute([0, 2, 1, 3]) on the corresponding tensor. 
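// Editorial illustration (not part of the original example): following the reshape/permute description above,
// Tensor4DPermute0213 maps a logical element (m, n) of the [M, N] matrix to position
//
//   row'    = (m / S1) * S2 + n / (N / S2)
//   column' = (m % S1) * (N / S2) + n % (N / S2)
//
// of the resulting [M*S2/S1, N*S1/S2] matrix. This is a sketch inferred from the comment above; the actual
// address computation is implemented by the layout classes in include/cutlass/layout/permute.h.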
+int constexpr S1 = 8; +int constexpr S2 = 4; + +// // // Alignments +int constexpr AlignmentA = 8; +int constexpr AlignmentB = 8; +int constexpr AlignmentC = 8; + +/// GEMM element types +using ElementInput = cutlass::half_t; +using ElementOutput = cutlass::half_t; +using ElementAccumulator = float; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Useful macros + +#define CHECK_CUDA_CALL(call, handler) \ +do { \ + cudaError_t __err = (call); \ + if (__err != cudaSuccess) { \ + std::cerr << #call " failed: " << cudaGetErrorString(__err) << std::endl; \ + handler; \ + } \ +} while(0) + +#define CHECK_CUTLASS_CALL(call, handler) \ +do { \ + cutlass::Status __status = (call); \ + if (__status != cutlass::Status::kSuccess) { \ + std::cerr << #call " failed: " << cutlass::cutlassGetStatusString(__status) << std::endl; \ + handler; \ + } \ +} while(0) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + bool reference_check; + + cutlass::gemm::GemmCoord problem_each; + + int batch_count; + int iterations; + int cuda_streams; + bool verbose; + float alpha; + float beta; + + // + // Methods + // + + Options(): + help(false), + error(false), + reference_check(true), + batch_count(-1), + iterations(20), + cuda_streams(0), + verbose(false), + alpha(1), + beta() + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("alpha", alpha, 1.0f); + cmd.get_cmd_line_argument("beta", beta, 0.0f); + cmd.get_cmd_line_argument("iterations", iterations, 20); + cmd.get_cmd_line_argument("streams", cuda_streams, 0); + cmd.get_cmd_line_argument("verbose", verbose, false); + cmd.get_cmd_line_argument("reference-check", reference_check, true); + + int m, n, k; + + cmd.get_cmd_line_argument("m", m, 384); + cmd.get_cmd_line_argument("n", n, 192); + cmd.get_cmd_line_argument("k", k, 384); + cmd.get_cmd_line_argument("batch-count", batch_count, 96); + + problem_each = cutlass::gemm::GemmCoord(m, n, k); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << + "39_gemm_permute\n" + "\n" + " This example tests and profiles the performance of normal GEMM and batched GEMM with different" + " combinations of fused permutations of input and output tensors." + "\n" + " Permutations considered in this example:\n" + "\n" + " Normal GEMM:\n" + " 1) Tensor4DPermute0213: matrix of shape [X, Y] is reshaped as [X/S1, S1, S2, Y/S2] and has its dimensions" + " permuted as [0, 2, 1, 3], resulting in shape [X/S1, S2, S1, Y/S2] viewed as matrix of shape [X*S2/S1, Y*S1/S2].\n" + " 2) Tensor5DPermute20314: matrix of shape [X, Y] is reshaped as [X/T1, T1, T2, T3, Y/T2/T3] and has its dimensions" + " permuted as [2, 0, 3, 1, 4], resulting in shape [T2, X/T1, T3, T1, Y/T2/T3] viewed as matrix of shape [X*T2/T1, Y*T1/T2].\n" + "\n" + " Batched GEMM:\n" + " 3) Tensor4DPermuteBMM0213: batched tensor of 3D shape [B, X, Y] is reshaped as 4D shape [B/D1, D1, X, Y]" + " and has its dimensions permuted as [0, 2, 1, 3], resulting in shape [B/D1, X, D1, Y] viewed as" + " a matrix of shape [B/D1, X, Y*D1] for batched GEMM purposes.\n" + "\n" + " Note: S1, S2, D1, D2, T1, T2, T3 are compile-time constants defined in gemm_permute.cu." 
+ " Runtime specification of these values is not supported." + " These values along with alignment requirements place constraints on supported matrix sizes.\n" + "\n" + " Note: X, Y above may refer to M, N or K dimensions of GEMM problem, depending on the tensor considered (A, B or D)." + " For the output tensor D the values correspond directly to dimensions of D, whereas for A and B the original dimensions" + " X', Y' are inferred from the ones supplied to the GEMM, taking into account the permute operation.\n" + "\n" + "Options:\n" + "\n" + " --help If specified, displays this usage statement.\n\n" + " --batch-count= Sets the number of batches in batched GEMM (batch number for BMM). (default: --batch-count=768)\n" + " --m= Sets the M dimension for both batched GEMM and normal GEMM problems. (default: --m=128)\n" + " --n= Sets the N dimension for both batched GEMM and normal GEMM problems. (default: --n=192)\n" + " --k= Sets the K dimension for both batched GEMM and normal GEMM problems. (default: --k=384)\n" + " --alpha= Epilogue scalar alpha (real part)\n" + " --beta= Epilogue scalar beta (real part)\n\n" + " --iterations= Number of profiling iterations to perform.\n" + " --reference-check= If true, performs reference check.\n" + " --verbose= If true, prints problem sizes and batching structure.\n" + "\n" + "Examples:\n" + "\n" + "# Runs a batched GEMM with 96 batches\n" + "$ ./examples/39_gemm_permute/39_gemm_permute --batch-count=96\n" + "\n" + "# Runs a batched GEMM with 96 batches (with GEMM-K dimension equal to 1024)\n" + "$ ./examples/39_gemm_permute/39_gemm_permute --batch-count=96 --k=1024 --verbose=true\n" + "\n" + "# Execute batched GEMM and profile with NSight\n" + "$ nv-nsight-cu-cli ./examples/39_gemm_permute/39_gemm_permute --m=256 --n=192 --k=256 --verbose=true --iterations=1 --reference-check=false\n" + "\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s, bool batched) const { + + // Number of real-valued multiply-adds + int64_t fmas = int64_t(); + + fmas += problem_each.product() * (batched ? 
batch_count : 1); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace { // (anonymous) + +/// Dimension-generic permutation loop +template +void permute_host_impl( + cutlass::TensorView const & input, + cutlass::TensorView const & output, + PermuteOp && permute, + Coord & coord +) { + static_assert(Layout::kRank == Coord::kRank, "Incompatible Layout and Coord types"); + if constexpr (I == Coord::kRank) { + output.at(permute(coord)) = input.at(coord); + } + else { + for (coord[I] = 0; coord[I] < input.extent(I); ++coord[I]) { + permute_host_impl(input, output, std::forward(permute), coord); + } + } +} + +} // namespace (anonymous) + +/// Perform a reference (host-based) permutation of an input tensor +template +void permute_host( + cutlass::TensorView const &input, + cutlass::TensorView const &output, + int batch_count) { + Layout layout = input.layout(); + cutlass::MatrixCoord extent = input.extent(); + + std::size_t num_elems = layout.capacity(extent) * batch_count; + std::vector h_input(num_elems); + cutlass::device_memory::copy_to_host(h_input.data(), input.data(), num_elems); + + std::vector h_output(num_elems); + + using Info = PermuteInfo; + using TensorLayout = typename Info::Layout; + + auto shape_orig = Info::original_shape(extent, batch_count); + auto shape_perm = Info::permute(shape_orig); + + cutlass::TensorView view_input(h_input.data(), TensorLayout::packed(shape_orig), shape_orig); + cutlass::TensorView view_output(h_output.data(), TensorLayout::packed(shape_perm), shape_perm); + + decltype(shape_orig) coord; + permute_host_impl<0>(view_input, view_output, Info::permute, coord); + + cutlass::device_memory::copy_to_device(output.data(), h_output.data(), num_elems); +} + +template +struct LayoutInfo; + +template<> +struct LayoutInfo { + static std::string name() { return "RowMajor"; } +}; + +template<> +struct LayoutInfo { + static std::string name() { return "ColumnMajor"; } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class Testbed { +private: + + // + // Data members + // + + Options & options; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint32_t seed; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + +public: + + // + // Methods + // + + Testbed( + Options &options_, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint32_t seed_ = 3090 + ): + options(options_), init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + +private: + + /// Print permutation info for one tensor + template + void print_tensor_info( + std::ostream & os, + std::string const &tensor_name, + int row_dim, + int col_dim) { + + cutlass::MatrixCoord extent(options.problem_each.at(row_dim), options.problem_each.at(col_dim)); + using Info = PermuteInfo; + + os << "tensor " << tensor_name << ": " << Info::desc() << "\n"; + os << " extent: [" << extent.row() << ", " << extent.column() << "]"; + if (Info::kBatched) { + os << ", batch count: " << options.batch_count; + } + os << "\n"; + if 
(!cutlass::layout::is_trivial_permute) { + auto shape_orig = Info::original_shape(extent, options.batch_count); + auto shape_perm = Info::permute(shape_orig); + os << " original: [" << shape_orig << "]\n"; + os << " permuted: [" << shape_perm << "]\n"; + } + } + + /// Check shape compatibility for one tensor + template + bool check_tensor_shape( + std::string const &tensor_name, + int row_dim, + int col_dim) { + + cutlass::MatrixCoord extent(options.problem_each.at(row_dim), options.problem_each.at(col_dim)); + + using Info = PermuteInfo; + + auto rowAlign = cutlass::platform::is_same::value ? Alignment : 1; + auto colAlign = cutlass::platform::is_same::value ? Alignment : 1; + + auto rowFactor = Info::kRowFactor * rowAlign; + auto colFactor = Info::kColumnFactor * colAlign; + + // Assumes row-major layout + bool const valid_row = extent.row() % rowFactor == 0; + if (!valid_row) { + std::cerr << "Invalid tensor " << tensor_name << " row size = " << extent.row() << ", " + "must be divisible by " << rowFactor << ", " + "required by " << Info::name() << + (rowAlign > 1 ? (" and alignment of " + std::to_string(rowAlign)) : "") << std::endl; + } + + bool const valid_col = extent.column() % colFactor == 0; + if (!valid_col) { + std::cerr << "Invalid tensor " << tensor_name << " column size = " << extent.column() << ", " + "must be divisible by " << colFactor << ", " + "required by " << Info::name() << + (colAlign > 1 ? (" and alignment of " + std::to_string(colAlign)) : "") << std::endl; + } + + bool const valid_bsz = options.batch_count % Info::kBatchFactor == 0; + if (!valid_bsz) { + std::cerr << "Invalid batch count = " << options.batch_count << ", " + "must be divisible by " << Info::kBatchFactor << ", " + "required by " << Info::name() << std::endl; + } + + return valid_row && valid_col && valid_bsz; + } + + /// Helper to initialize a tensor view + template + void initialize_tensor_( + Element *ptr, + size_t capacity, + cutlass::Distribution::Kind dist_kind, + uint32_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else if (bits_output == 16) { + if (cutlass::sizeof_bits::value <= 16) { + scope_max = 5; + scope_min = -5; + } + else { + scope_max = 8; + scope_min = -8; + } + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::device::BlockFillRandomUniform( + ptr, capacity, seed, scope_max, scope_min, 0); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::device::BlockFillRandomGaussian( + ptr, capacity, seed, Element(), Element(0.5f)); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + // Fill with increasing elements + cutlass::reference::device::BlockFillSequential( + ptr, capacity, Element(1), Element()); + } + else { + + // Fill with all 1s + cutlass::reference::device::BlockFillSequential( + ptr, capacity, Element(), Element(1)); + } + } + + /// Initializes data structures + void initialize(int batch_count) { + + srand(seed); + + int64_t total_elements_A = options.problem_each.m() * options.problem_each.k() * batch_count; + int64_t total_elements_B = options.problem_each.n() * options.problem_each.k() * batch_count; + int64_t total_elements_C = options.problem_each.m() * options.problem_each.n() * batch_count; + int64_t total_elements_D = 
options.problem_each.m() * options.problem_each.n() * batch_count; + + // Allocate space + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + + // Initialize input tensors + initialize_tensor_(block_A.get(), total_elements_A, init_A, seed * 2021); + initialize_tensor_(block_B.get(), total_elements_B, init_B, seed * 2022); + initialize_tensor_(block_C.get(), total_elements_C, init_C, seed * 2023); + + cutlass::reference::device::BlockFillSequential( + block_D.get(), total_elements_D, ElementC(), ElementC()); + } + + + /// Check device GEMM results against a reference implementation with separate host-based permutation + template + bool validate(Gemm const &gemm) { + + bool constexpr kBatched = PermuteInfo::kBatched + || PermuteInfo::kBatched + || PermuteInfo::kBatched; + + int const batch_count = kBatched ? options.batch_count : 1; + + cutlass::gemm::GemmCoord problem = options.problem_each; + + cutlass::MatrixCoord extent_A{problem.m(), problem.k()}; + cutlass::MatrixCoord extent_B{problem.k(), problem.n()}; + cutlass::MatrixCoord extent_C{problem.m(), problem.n()}; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + + LayoutA layout_A(LayoutA::packed(extent_A)); + LayoutB layout_B(LayoutB::packed(extent_B)); + LayoutC layout_C(LayoutC::packed(extent_C)); + + auto size_A = layout_A.capacity(extent_A) * batch_count; + auto size_B = layout_B.capacity(extent_B) * batch_count; + auto size_C = layout_C.capacity(extent_C) * batch_count; + + cutlass::TensorView view_A(block_A.get(), layout_A, extent_A); + cutlass::TensorView view_B(block_B.get(), layout_B, extent_B); + cutlass::TensorView view_C(block_C.get(), layout_C, extent_C); + cutlass::TensorView view_D(block_D.get(), layout_C, extent_C); + + cutlass::DeviceAllocation block_A_perm(size_A); + cutlass::DeviceAllocation block_B_perm(size_B); + + cutlass::TensorView view_A_perm(block_A_perm.get(), layout_A, extent_A); + cutlass::TensorView view_B_perm(block_B_perm.get(), layout_B, extent_B); + + permute_host(view_A.const_view(), view_A_perm, batch_count); + permute_host(view_B.const_view(), view_B_perm, batch_count); + + cutlass::DeviceAllocation block_D_ref(size_C); + cutlass::TensorView view_D_ref(block_D_ref.get(), layout_C, extent_C); + + using EpilogueOutputOp = typename Gemm::GemmKernel::Epilogue::OutputOp; + + // Reference GEMM + cutlass::reference::device::GemmComplex< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + typename EpilogueOutputOp::ElementCompute, + typename Gemm::ElementAccumulator + >( + problem, + options.alpha, + view_A_perm, + Gemm::kTransformA, + view_B_perm, + Gemm::kTransformB, + options.beta, + view_C, + view_D_ref, + ElementAccumulator(0), + batch_count, + options.problem_each.m() * options.problem_each.k(), + options.problem_each.n() * options.problem_each.k(), + options.problem_each.m() * options.problem_each.n(), + options.problem_each.m() * options.problem_each.n() + ); + + cutlass::DeviceAllocation block_D_perm(size_C); + cutlass::TensorView view_D_perm(block_D_perm.get(), layout_C, extent_C); + permute_host(view_D_ref.const_view(), view_D_perm, batch_count); + + // Reference check + return cutlass::reference::device::BlockCompareEqual(view_D_perm.data(), view_D.data(), size_C); +} + +public: + + template + bool profile_GEMM_permute() { + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using 
LayoutC = typename Gemm::LayoutC; + + using PermuteALayout = typename Gemm::PermuteALayout; + using PermuteBLayout = typename Gemm::PermuteBLayout; + using PermuteDLayout = typename Gemm::PermuteDLayout; + + bool constexpr kBatched = PermuteInfo::kBatched + || PermuteInfo::kBatched + || PermuteInfo::kBatched; + + std::cout << "\n" + "====================================================\n" + << (kBatched ? "Batched" : "Normal") << " GEMM:" + << "\n A=" << LayoutInfo::name() << "," << PermuteInfo::name() + << "\n B=" << LayoutInfo::name() << "," << PermuteInfo::name() + << "\n D=" << LayoutInfo::name() << "," << PermuteInfo::name() + << "\n" + "====================================================\n"; + + if (options.verbose) { + print_tensor_info(std::cout, "A", 0, 2); + print_tensor_info(std::cout, "B", 2, 1); + print_tensor_info(std::cout, "D", 0, 1); + } + std::cout << std::endl; + + bool valid = true; + valid &= check_tensor_shape("A", 0, 2); + valid &= check_tensor_shape("B", 2, 1); + valid &= check_tensor_shape("D", 0, 1); + if (!valid) + { + std::cout << "Skipped test" << std::endl; + return true; + } + + int const batch_count = kBatched ? options.batch_count : 1; + + // Initialize the problem + initialize(batch_count); + + // Configure the GEMM arguments + using EpilogueOutputOp = typename Gemm::GemmKernel::Epilogue::OutputOp; + typename EpilogueOutputOp::Params epilogue_op(options.alpha, options.beta); + + // Please make sure all problem_sizes are the same for kBatched mode + auto problem = options.problem_each; + + cutlass::MatrixCoord extent_A{problem.m(), problem.k()}; + cutlass::MatrixCoord extent_B{problem.k(), problem.n()}; + cutlass::MatrixCoord extent_C{problem.m(), problem.n()}; + + LayoutA layout_A(LayoutA::packed(extent_A)); + LayoutB layout_B(LayoutB::packed(extent_B)); + LayoutC layout_C(LayoutC::packed(extent_C)); + + // Configure GEMM arguments + typename Gemm::Arguments arguments{ + kBatched ? cutlass::gemm::GemmUniversalMode::kBatched : cutlass::gemm::GemmUniversalMode::kGemm, + problem, + batch_count, + epilogue_op, + (void*)block_A.get(), + (void*)block_B.get(), + (void*)block_C.get(), + (void*)block_D.get(), + // For any non-trivial permute the batch stride must be set to 0 + cutlass::layout::is_trivial_permute ? layout_A.capacity(extent_A) : 0, + cutlass::layout::is_trivial_permute ? layout_B.capacity(extent_B) : 0, + layout_C.capacity(extent_C), + cutlass::layout::is_trivial_permute ? 
layout_C.capacity(extent_C) : 0, + layout_A.stride(0), + layout_B.stride(0), + layout_C.stride(0), + layout_C.stride(0), + }; + + // Initialize the GEMM object + Gemm gemm_normal; + + CHECK_CUTLASS_CALL(gemm_normal.initialize(arguments, nullptr), return false); + + // Run the normal GEMM object + CHECK_CUTLASS_CALL(gemm_normal.run(), return false); + + // Wait for completion + CHECK_CUDA_CALL(cudaDeviceSynchronize(), return false); + + // + // Verify correctness + // + if (options.reference_check) { + if (validate(gemm_normal)) { + std::cout << "\nPassed verification\n" << std::endl; + } + else { + std::cerr << "\n*** Error - problem failed the QA check ***\n" << std::endl; + return false; + } + } + + // Warm-up run of the normal GEMM object + CHECK_CUTLASS_CALL(gemm_normal.run(), return false); + + // Construct events + cudaEvent_t events[2]; + for (auto & event : events) { + CHECK_CUDA_CALL(cudaEventCreate(&event), return false); + } + + // Record an event at the start of a series of GEMM operations + CHECK_CUDA_CALL(cudaEventRecord(events[0]), return false); + + // Run profiling loop + for (int iter = 0; iter < options.iterations; ++iter) { + gemm_normal(); + } + + // Record an event when the GEMM operations have been launched. + CHECK_CUDA_CALL(cudaEventRecord(events[1]), return false); + + // Wait for work on the device to complete. + CHECK_CUDA_CALL(cudaEventSynchronize(events[1]), return false); + + // Measure elapsed runtime + float runtime_total_ms = 0; + CHECK_CUDA_CALL(cudaEventElapsedTime(&runtime_total_ms, events[0], events[1]), return false); + + // Compute average runtime and GFLOPs. + double runtime_avg_ms = double(runtime_total_ms) / double(options.iterations); + double gflops = options.gflops(runtime_avg_ms / 1000.0, kBatched); + + // Cleanup + for (auto event : events) { + CHECK_CUDA_CALL(cudaEventDestroy(event), return false); + } + + std::cout << " Runtime: " << runtime_avg_ms << " ms\n" + " GFLOPs: " << gflops << std::endl; + + return true; + } +}; + +/// Shorthand alist for GEMM instantiations +template +using GemmPermute = cutlass::gemm::device::GemmUniversal< + ElementInput, LayoutA, + ElementInput, LayoutB, + ElementOutput, LayoutC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + AlignmentC, //128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 4, /*kStages*/ + AlignmentA, /*AlignmentA*/ + AlignmentB, /*AlignmentB*/ + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + false, /*GatherA*/ + false, /*GatherB*/ + false, /*ScatterD*/ + PermuteDLayout, /*PermuteDLayout*/ + typename cutlass::layout::InversePermute::type, /*PermuteALayout*/ + typename cutlass::layout::InversePermute::type /*PermuteBLayout*/ +>; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. + // + + cudaDeviceProp props; + + CHECK_CUDA_CALL(cudaGetDeviceProperties(&props, 0), return EXIT_FAILURE); + + if (__CUDACC_VER_MAJOR__ < 11 || props.major < 8) { + + // + // This example requires an NVIDIA Ampere-architecture GPU. 
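+    // The GemmPermute kernels defined above are instantiated for cutlass::arch::Sm80 with
+    // Tensor Core MMA, so both a CUDA 11+ toolkit and a device of compute capability 8.0
+    // or greater are required; otherwise the example exits early with the message below.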
+ // + + std::cout << "CUTLASS's GEMM+Permute example requires a GPU of NVIDIA's Ampere Architecture " + "or later (compute capability 80 or greater).\n"; + + return EXIT_SUCCESS; + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return EXIT_SUCCESS; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return EXIT_FAILURE; + } + + // + // Define GEMM types to test + // + + // + // TTT (Row-major) GEMMs + // + + using TTTGemmNormalPermuteNone = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::NoPermute + >; + + using TTTGemmNormalPermuteA = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::NoPermute + >; + + using TTTGemmNormalPermuteAD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor5DPermute20314RowMajor + >; + + using TTTGemmNormalPermuteB = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::NoPermute + >; + + using TTTGemmNormalPermuteBD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::Tensor5DPermute20314RowMajor + >; + + using TTTGemmNormalPermuteD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor5DPermute20314RowMajor + >; + + using TTTGemmNormalPermuteAB = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::NoPermute + >; + + using TTTGemmNormalPermuteABD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::Tensor5DPermute20314RowMajor + >; + + // + // NNN (Col-major) GEMMs + // + + using NNNGemmNormalPermuteNone = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute + >; + + using NNNGemmNormalPermuteA = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute + >; + + using NNNGemmNormalPermuteAD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor + >; + + using NNNGemmNormalPermuteB = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213ColumnMajor, + 
cutlass::layout::ColumnMajor, cutlass::layout::NoPermute + >; + + using NNNGemmNormalPermuteBD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor + >; + + using NNNGemmNormalPermuteD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor + >; + + using NNNGemmNormalPermuteAB = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute + >; + + using NNNGemmNormalPermuteABD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor + >; + + // + // NNT (Col-major inputs, row-major output) GEMMs + // + + using NNTGemmNormalPermuteNone = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::NoPermute + >; + + using NNTGemmNormalPermuteA = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::NoPermute + >; + + using NNTGemmNormalPermuteAD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor5DPermute20314RowMajor + >; + + using NNTGemmNormalPermuteB = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213ColumnMajor, + cutlass::layout::RowMajor, cutlass::layout::NoPermute + >; + + using NNTGemmNormalPermuteBD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213ColumnMajor, + cutlass::layout::RowMajor, cutlass::layout::Tensor5DPermute20314RowMajor + >; + + using NNTGemmNormalPermuteD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor5DPermute20314RowMajor + >; + + using NNTGemmNormalPermuteAB = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213ColumnMajor, + cutlass::layout::RowMajor, cutlass::layout::NoPermute + >; + + using NNTGemmNormalPermuteABD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermute0213ColumnMajor, + cutlass::layout::RowMajor, cutlass::layout::Tensor5DPermute20314RowMajor + >; + + // + // TTN (Row-major inputs, col-major output) GEMMs + // + + using TTNGemmNormalPermuteNone = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, 
cutlass::layout::NoPermute + >; + + using TTNGemmNormalPermuteA = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute + >; + + using TTNGemmNormalPermuteAD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor + >; + + using TTNGemmNormalPermuteB = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute + >; + + using TTNGemmNormalPermuteBD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor + >; + + using TTNGemmNormalPermuteD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor + >; + + using TTNGemmNormalPermuteAB = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute + >; + + using TTNGemmNormalPermuteABD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermute0213RowMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor5DPermute02413ColumnMajor + >; + + // + // TTT (Row-major) BMMs + // + + using TTTGemmBatchedPermuteA = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::NoPermute + >; + + using TTTGemmBatchedPermuteAD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor + >; + + using TTTGemmBatchedPermuteB = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::NoPermute + >; + + using TTTGemmBatchedPermuteBD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor + >; + + using TTTGemmBatchedPermuteD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor + >; + + using TTTGemmBatchedPermuteAB = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::NoPermute, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor + >; + + using TTTGemmBatchedPermuteABD = GemmPermute< + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor, + 
cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor, + cutlass::layout::RowMajor, cutlass::layout::Tensor4DPermuteBMM0213RowMajor + >; + + // + // NNN (Col-major) BMMs + // + + using NNNGemmBatchedPermuteA = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute + >; + + using NNNGemmBatchedPermuteAD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor + >; + + using NNNGemmBatchedPermuteB = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute + >; + + using NNNGemmBatchedPermuteBD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor + >; + + using NNNGemmBatchedPermuteD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor + >; + + using NNNGemmBatchedPermuteAB = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::NoPermute + >; + + using NNNGemmBatchedPermuteABD = GemmPermute< + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor, + cutlass::layout::ColumnMajor, cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor + >; + + // + // Profile it + // + + Testbed testbed(options); + + bool result = true; + + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= 
testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + result &= testbed.profile_GEMM_permute(); + + std::cout << "\n" + "====================================================\n" + "Finished (" << (result ? "PASS" : "FAIL") << ")\n" + "====================================================" << std::endl; + + return result ? EXIT_SUCCESS : EXIT_FAILURE; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/39_gemm_permute/layouts.h b/examples/39_gemm_permute/layouts.h new file mode 100644 index 0000000000..3632ec0afb --- /dev/null +++ b/examples/39_gemm_permute/layouts.h @@ -0,0 +1,506 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines additional layout functions used in Permute GEMM example to simplify + computing reference permutations of 4/5D tensors when source data is column-major. 
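+
+           The layouts defined here (TensorCWHN, TensorNHCW, TensorNCWH, TensorCWHDN) mirror
+           cutlass::layout::TensorNHWC / TensorNDHWC but keep a different dimension contiguous
+           in memory; several of them are used by PermuteInfo (permute_info.h) to describe the
+           storage order of host reference tensors for the column-major permute variants.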
+*/ +#pragma once +#include +#include "cutlass/cutlass.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/coord.h" +#include "cutlass/tensor_coord.h" + +namespace cutlass { +namespace layout { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Mapping function for 4-D CWHN tensors. +class TensorCWHN { +public: + /// Logical rank of tensor + static int const kRank = 4; + + /// Rank of stride vector + static int const kStrideRank = 3; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate (n, h, w, c) + using TensorCoord = Tensor4DCoord; + + /// Stride vector + using Stride = Coord; + +private: + // + // Data members + // + + /// Stride data member - [n, hn, whn] + Stride stride_; + +public: + // + // Methods + // + + /// Constructor + CUTLASS_HOST_DEVICE + TensorCWHN(Stride const &stride = Stride(0)): stride_(stride) { } + + /// Constructor + CUTLASS_HOST_DEVICE + TensorCWHN( + typename Stride::Index stride_h, ///< number of elements between adjacent N coordinates + typename Stride::Index stride_w, ///< number of elements between adjacent C coordinates + typename Stride::Index stride_c ///< number of elements between adjacent W coordinates + ): + stride_(make_Coord(stride_h, stride_w, stride_c)) { } + + /// Constructor + // Once convolutions implement 64b stride this ctor can be deleted + CUTLASS_HOST_DEVICE + TensorCWHN(Coord const &stride): + stride_(make_Coord( + static_cast(stride[0]), + static_cast(stride[1]), + static_cast(stride[2])) + ) { } + + /// Helper returns a layout to a tightly packed WCNH tensor. + CUTLASS_HOST_DEVICE + static TensorCWHN packed(TensorCoord const &extent) { + return TensorCWHN( + make_Coord( + extent.n(), + extent.h() * extent.n(), + extent.w() * extent.h() * extent.n() + ) + ); + } + + /// Returns the offset of a coordinate (n, h, w, c) in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return coord.n() + + LongIndex(stride_[0] * coord.h()) + + LongIndex(stride_[1] * coord.w()) + + LongIndex(stride_[2] * coord.c()); + } + + /// Returns the offset of a pitchlinear coordinate in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(PitchLinearCoord coord) const { + return coord.contiguous() + LongIndex(coord.strided() * stride_[2]); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return stride_; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return stride_; + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + // it does not make sense if the extent is larger than stride + // and we could not rely on the capacity calculation in such cases + // we could move this checkers to debug code only + if ((extent.n() > stride_[0]) + || (extent.h() * stride_[0] > stride_[1]) + || (extent.w() * stride_[1] > stride_[2])) { + assert(0); + } + return extent.c() * stride_[2]; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Mapping function for 4-D NHCW tensors. 
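+/// In a packed NHCW tensor the W dimension is contiguous in memory; the stored strides
+/// correspond to the C, H and N coordinates, i.e. [w, c*w, h*c*w] when tightly packed.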
+class TensorNHCW { +public: + /// Logical rank of tensor + static int const kRank = 4; + + /// Rank of stride vector + static int const kStrideRank = 3; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate (n, h, w, c) + using TensorCoord = Tensor4DCoord; + + /// Stride vector + using Stride = Coord; + +private: + // + // Data members + // + + /// Stride data member - [w, cw, hcw] + Stride stride_; + +public: + // + // Methods + // + + /// Constructor + CUTLASS_HOST_DEVICE + TensorNHCW(Stride const &stride = Stride(0)): stride_(stride) { } + + /// Constructor + CUTLASS_HOST_DEVICE + TensorNHCW( + typename Stride::Index stride_c, ///< number of elements between adjacent C coordinates + typename Stride::Index stride_h, ///< number of elements between adjacent H coordinates + typename Stride::Index stride_n ///< number of elements between adjacent N coordinates + ): + stride_(make_Coord(stride_c, stride_h, stride_n)) { } + + /// Constructor + // Once convolutions implement 64b stride this ctor can be deleted + CUTLASS_HOST_DEVICE + TensorNHCW(Coord const &stride): + stride_(make_Coord( + static_cast(stride[0]), + static_cast(stride[1]), + static_cast(stride[2])) + ) { } + + /// Helper returns a layout to a tightly packed WCNH tensor. + CUTLASS_HOST_DEVICE + static TensorNHCW packed(TensorCoord const &extent) { + return TensorNHCW( + make_Coord( + extent.w(), + extent.c() * extent.w(), + extent.h() * extent.c() * extent.w() + ) + ); + } + + /// Returns the offset of a coordinate (n, h, w, c) in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return coord.w() + + LongIndex(stride_[0] * coord.c()) + + LongIndex(stride_[1] * coord.h()) + + LongIndex(stride_[2] * coord.n()); + } + + /// Returns the offset of a pitchlinear coordinate in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(PitchLinearCoord coord) const { + return coord.contiguous() + LongIndex(coord.strided() * stride_[2]); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return stride_; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return stride_; + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + // it does not make sense if the extent is larger than stride + // and we could not rely on the capacity calculation in such cases + // we could move this checkers to debug code only + if ((extent.w() > stride_[0]) + || (extent.c() * stride_[0] > stride_[1]) + || (extent.h() * stride_[1] > stride_[2])) { + assert(0); + } + return extent.n() * stride_[2]; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Mapping function for 4-D NHCW tensors. 
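+/// (This class implements the NCWH ordering: the H dimension is contiguous in memory, and the
+/// stored strides correspond to the W, C and N coordinates, i.e. [h, w*h, c*w*h] when packed.)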
+class TensorNCWH { +public: + /// Logical rank of tensor + static int const kRank = 4; + + /// Rank of stride vector + static int const kStrideRank = 3; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate (n, h, w, c) + using TensorCoord = Tensor4DCoord; + + /// Stride vector + using Stride = Coord; + +private: + // + // Data members + // + + /// Stride data member - [h, wh, cwh] + Stride stride_; + +public: + // + // Methods + // + + /// Constructor + CUTLASS_HOST_DEVICE + TensorNCWH(Stride const &stride = Stride(0)): stride_(stride) { } + + /// Constructor + CUTLASS_HOST_DEVICE + TensorNCWH( + typename Stride::Index stride_w, ///< number of elements between adjacent C coordinates + typename Stride::Index stride_c, ///< number of elements between adjacent H coordinates + typename Stride::Index stride_n ///< number of elements between adjacent N coordinates + ): + stride_(make_Coord(stride_w, stride_c, stride_n)) { } + + /// Constructor + // Once convolutions implement 64b stride this ctor can be deleted + CUTLASS_HOST_DEVICE + TensorNCWH(Coord const &stride): + stride_(make_Coord( + static_cast(stride[0]), + static_cast(stride[1]), + static_cast(stride[2])) + ) { } + + /// Helper returns a layout to a tightly packed WCNH tensor. + CUTLASS_HOST_DEVICE + static TensorNCWH packed(TensorCoord const &extent) { + return TensorNCWH( + make_Coord( + extent.h(), + extent.w() * extent.h(), + extent.c() * extent.w() * extent.h() + ) + ); + } + + /// Returns the offset of a coordinate (n, h, w, c) in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return coord.h() + + LongIndex(stride_[0] * coord.w()) + + LongIndex(stride_[1] * coord.c()) + + LongIndex(stride_[2] * coord.n()); + } + + /// Returns the offset of a pitchlinear coordinate in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(PitchLinearCoord coord) const { + return coord.contiguous() + LongIndex(coord.strided() * stride_[2]); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return stride_; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return stride_; + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + // it does not make sense if the extent is larger than stride + // and we could not rely on the capacity calculation in such cases + // we could move this checkers to debug code only + if ((extent.h() > stride_[0]) + || (extent.w() * stride_[0] > stride_[1]) + || (extent.c() * stride_[1] > stride_[2])) { + assert(0); + } + return extent.n() * stride_[2]; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Mapping function for 5-D CWHDN tensors. 
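+/// In a packed CWHDN tensor the N dimension is contiguous in memory; the stored strides
+/// correspond to the D, H, W and C coordinates, i.e. [n, d*n, h*d*n, w*h*d*n] when packed.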
+class TensorCWHDN { +public: + /// Logical rank of tensor + static int const kRank = 5; + + /// Rank of stride vector + static int const kStrideRank = 4; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate (n, d, h, w, c) + using TensorCoord = Tensor5DCoord; + + /// Stride vector + using Stride = Coord; + +private: + // + // Data members + // + + /// Stride data member - [n, dn, hdn, whdn] + Stride stride_; + +public: + // + // Methods + // + + /// Constructor + CUTLASS_HOST_DEVICE + TensorCWHDN(Stride const &stride = Stride(0)): stride_(stride) { } + + /// Constructor + CUTLASS_HOST_DEVICE + TensorCWHDN( + typename Stride::Index n, + typename Stride::Index dn, + typename Stride::Index hdn, + typename Stride::Index whdn): + stride_(make_Coord(n, dn, hdn, whdn)) { } + + /// Constructor + // Once convolutions implement 64b stride this ctor can be deleted + CUTLASS_HOST_DEVICE + TensorCWHDN(Coord const &stride): + stride_(make_Coord( + static_cast(stride[0]), + static_cast(stride[1]), + static_cast(stride[2]), + static_cast(stride[3])) + ) { } + + /// Helper returns a layout to a tightly packed CWHDN tensor. + CUTLASS_HOST_DEVICE + static TensorCWHDN packed(TensorCoord const &extent) { + return TensorCWHDN( + make_Coord( + extent.n(), + extent.d() * extent.n(), + extent.h() * extent.d() * extent.n(), + extent.w() * extent.h() * extent.d() * extent.n() + ) + ); + } + + /// Returns the offset of a coordinate (n, d, h, w, c) in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return coord.n() + + LongIndex(stride_[0] * coord.d()) + + LongIndex(stride_[1] * coord.h()) + + LongIndex(stride_[2] * coord.w()) + + LongIndex(stride_[3] * coord.c()); + } + + /// Returns the offset of a pitchlinear coordinate in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(PitchLinearCoord coord) const { + return coord.contiguous() + LongIndex(coord.strided() * stride_[3]); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return stride_; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return stride_; + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + // it does not make sense if the extent is larger than stride + // and we could not rely on the capacity calculation in such cases + // we could move this checkers to debug code only + if ((extent.n() > stride_[0]) + || (extent.d() * stride_[0] > stride_[1]) + || (extent.h() * stride_[1] > stride_[2]) + || (extent.w() * stride_[2] > stride_[3])) { + assert(0); + } + return extent.c() * stride_[3]; + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace layout +} // namespace cutlass diff --git a/examples/39_gemm_permute/permute_info.h b/examples/39_gemm_permute/permute_info.h new file mode 100644 index 0000000000..57672e7c49 --- /dev/null +++ b/examples/39_gemm_permute/permute_info.h @@ -0,0 +1,344 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Contains additional metadata about layout permute functions used in the example. +*/ + +#include "cutlass/tensor_coord.h" +#include "cutlass/layout/permute.h" + +/// Additional permutation metadata to facilitate testing/printing +template +struct PermuteInfo; + +/// Specialization for default case (no permute). Other specializations must follow this template. +template<> +struct PermuteInfo { + + /// Whether this is a BMM or GEMM permutation (NoPermute can actually be either) + static bool constexpr kBatched = false; + + /// Minimal divisor for row extent + static int constexpr kRowFactor = 1; + + /// Minimum divisor for column extent + static int constexpr kColumnFactor = 1; + + /// Minimum divisor for batch size dimension + static int constexpr kBatchFactor = 1; + + /// Tensor layout used in permutation operation + using Layout = cutlass::layout::PackedVectorLayout; + + static std::string name() { + return "NoPermute"; + } + + /// User-friendly description of the permute operation + static std::string desc() { + return "no permutation"; + } + + /// Infer original higher-rank tensor shape from GEMM/BMM matrix extents. + /// For direct (output) permutations, must be a simple reshape of extent. + /// For inverse (input) permutations, must return shape *before* permute operation. + /// In case of NoPermute, simply use a linear (rank 1) view of the memory + static Layout::TensorCoord original_shape(cutlass::MatrixCoord extent, int batch_count) { + return Layout::TensorCoord(extent.row() * extent.column() * batch_count); + } + + /// Compute the permuted higher-rank tensor shape from the original shape. 
+ static Layout::TensorCoord permute(Layout::TensorCoord const &s) { + return s; + } +}; + +template +struct PermuteInfo> { + + static bool constexpr kBatched = true; + static int constexpr kRowFactor = 1; + static int constexpr kColumnFactor = 1; + static int constexpr kBatchFactor = D1; + + using Layout = cutlass::layout::TensorNHWC; + + static std::string name() { + return "Tensor4DPermuteBMM0213<" + std::to_string(D1) + ">"; + } + + static std::string desc() { + return "batched GEMM permutation [0, 2, 1, 3]"; + } + + static Layout::TensorCoord original_shape(cutlass::MatrixCoord extent, int batch_count) { + int D0 = batch_count / D1; + int D2 = extent.row(); + int D3 = extent.column(); + return {D0, D1, D2, D3}; + } + + static Layout::TensorCoord permute(Layout::TensorCoord const &s) { + return {s[0], s[2], s[1], s[3]}; + } +}; + +template +struct PermuteInfo> +: public PermuteInfo> { + + static bool constexpr kBatched = true; + static int constexpr kRowFactor = 1; + static int constexpr kColumnFactor = D1; + static int constexpr kBatchFactor = 1; + + using Base = PermuteInfo>; + using Layout = typename Base::Layout; + + static typename Layout::TensorCoord original_shape(cutlass::MatrixCoord extent, int batch_count) { + int D0 = batch_count; + int D2 = extent.row(); + int D3 = extent.column() / D1; + return {D0, D1, D2, D3}; + } +}; + +template +struct PermuteInfo> { + + static bool constexpr kBatched = true; + static int constexpr kRowFactor = 1; + static int constexpr kColumnFactor = 1; + static int constexpr kBatchFactor = D1; + + using Layout = cutlass::layout::TensorNHCW; + + static std::string name() { + return "Tensor4DPermuteBMM0321<" + std::to_string(D1) + ">"; + } + + static std::string desc() { + return "batched GEMM permutation [0, 3, 2, 1]"; + } + + static Layout::TensorCoord original_shape(cutlass::MatrixCoord extent, int batch_count) { + int D0 = batch_count / D1; + int D2 = extent.row(); + int D3 = extent.column(); + return {D0, D1, D2, D3}; + } + + static Layout::TensorCoord permute(Layout::TensorCoord const &s) { + return {s[0], s[3], s[2], s[1]}; + } +}; + +template +struct PermuteInfo> +: public PermuteInfo> { + + static bool constexpr kBatched = true; + static int constexpr kRowFactor = D1; + static int constexpr kColumnFactor = 1; + static int constexpr kBatchFactor = 1; + + using Base = PermuteInfo>; + using Layout = typename Base::Layout; + + static typename Layout::TensorCoord original_shape(cutlass::MatrixCoord extent, int batch_count) { + int D0 = batch_count; + int D2 = extent.row() / D1; + int D3 = extent.column(); + return {D0, D1, D2, D3}; + } +}; + +template +struct PermuteInfo> { + + static bool constexpr kBatched = false; + static int constexpr kRowFactor = D1; + static int constexpr kColumnFactor = D2; + static int constexpr kBatchFactor = 1; + + using Layout = cutlass::layout::TensorNHWC; + + static std::string name() { + return "Tensor4DPermute0213<" + std::to_string(D1) + "," + std::to_string(D2) + ">"; + } + + static std::string desc() { + return "normal GEMM permutation [0, 2, 1, 3]"; + } + + static Layout::TensorCoord original_shape(cutlass::MatrixCoord extent, int batch_count) { + int D0 = extent.row() / D1; + int D3 = extent.column() / D2; + return {D0, D1, D2, D3}; + } + + static Layout::TensorCoord permute(Layout::TensorCoord const &s) { + return {s[0], s[2], s[1], s[3]}; + } +}; + +template +struct PermuteInfo> +: public PermuteInfo> { + + static bool constexpr kBatched = false; + static int constexpr kRowFactor = D2; + static int constexpr 
kColumnFactor = D1; + static int constexpr kBatchFactor = 1; + + using Base = PermuteInfo>; + using Layout = typename Base::Layout; + + static typename Layout::TensorCoord original_shape(cutlass::MatrixCoord extent, int batch_count) { + int D0 = extent.row() / D2; + int D3 = extent.column() / D1; + return {D0, D1, D2, D3}; + } +}; + +template +struct PermuteInfo> +: public PermuteInfo> { + using Layout = cutlass::layout::TensorCWHN; +}; + +template +struct PermuteInfo> +: public PermuteInfo> { + using Layout = cutlass::layout::TensorCWHN; +}; + +template +struct PermuteInfo> { + + static bool constexpr kBatched = false; + static int constexpr kRowFactor = T1; + static int constexpr kColumnFactor = T2 * T3; + static int constexpr kBatchFactor = 1; + + using Layout = cutlass::layout::TensorNDHWC; + + static std::string name() { + return "Tensor5DPermute20314<" + std::to_string(T1) + "," + std::to_string(T2) + "," + std::to_string(T3) + ">"; + } + + static std::string desc() { + return "normal GEMM permutation [2, 0, 3, 1, 4]"; + } + + static Layout::TensorCoord original_shape(cutlass::MatrixCoord extent, int batch_count) + { + int const T0 = extent.row() / T1; + int const T4 = extent.column() / (T2 * T3); + return {T0, T1, T2, T3, T4}; + } + + static Layout::TensorCoord permute(Layout::TensorCoord const &s) + { + return {s[2], s[0], s[3], s[1], s[4]}; + } +}; + +template +struct PermuteInfo> +: public PermuteInfo> { + + static bool constexpr kBatched = false; + static int constexpr kRowFactor = T2; + static int constexpr kColumnFactor = T1 * T3; + static int constexpr kBatchFactor = 1; + + using Base = PermuteInfo>; + using Layout = typename Base::Layout; + + static typename Layout::TensorCoord original_shape(cutlass::MatrixCoord extent, int batch_count) { + int const T0 = extent.row() / T2; + int const T4 = extent.column() / (T1 * T3); + return {T0, T1, T2, T3, T4}; + } +}; + +template +struct PermuteInfo> { + + static bool constexpr kBatched = false; + static int constexpr kRowFactor = T1; + static int constexpr kColumnFactor = T2 * T3; + static int constexpr kBatchFactor = 1; + + using Layout = cutlass::layout::TensorCWHDN; + + static std::string name() { + return "Tensor5DPermute02413<" + std::to_string(T1) + "," + std::to_string(T2) + "," + std::to_string(T3) + ">"; + } + + static std::string desc() { + return "normal GEMM permutation [0, 2, 4, 1, 3]"; + } + + using Coord = cutlass::Tensor5DCoord; + + static Layout::TensorCoord original_shape(cutlass::MatrixCoord extent, int batch_count) + { + int const T0 = extent.row() / T1; + int const T4 = extent.column() / (T2 * T3); + return {T0, T1, T2, T3, T4}; + } + + static Layout::TensorCoord permute(Layout::TensorCoord const &s) + { + return {s[0], s[2], s[4], s[1], s[3]}; + } +}; + +template +struct PermuteInfo> +: public PermuteInfo> { + + static bool constexpr kBatched = false; + static int constexpr kRowFactor = T2; + static int constexpr kColumnFactor = T1 * T3; + static int constexpr kBatchFactor = 1; + + using Base = PermuteInfo>; + using Layout = typename Base::Layout; + + static typename Layout::TensorCoord original_shape(cutlass::MatrixCoord extent, int batch_count) { + int const T0 = extent.row() / T2; + int const T4 = extent.column() / (T1 * T3); + return {T0, T1, T2, T3, T4}; + } +}; diff --git a/examples/40_cutlass_py/README.md b/examples/40_cutlass_py/README.md new file mode 100644 index 0000000000..c670e34072 --- /dev/null +++ b/examples/40_cutlass_py/README.md @@ -0,0 +1,4 @@ +# PyCUTLASS Examples + +This directory contains 
deprecated examples for PyCUTLASS, a precursor to the CUTLASS Python interface. +For examples of using CUTLASS's actively-maintained Pythonic interface, see the [examples/python](/examples/python) directory. diff --git a/examples/40_cutlass_py/conv2d.py b/examples/40_cutlass_py/conv2d.py new file mode 100644 index 0000000000..71e94259ff --- /dev/null +++ b/examples/40_cutlass_py/conv2d.py @@ -0,0 +1,177 @@ +################################################################################ +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +""" +Basic example of using the CUTLASS Python interface to run a 2d convolution +""" + +import sys +print("This example is deprecated. Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + +import argparse +import numpy as np +import torch + +import cutlass_bindings +import cutlass.backend as pycutlass +from cutlass.backend import * +from cutlass.backend.utils.reference_model import Conv2dReferenceModule +from cutlass.backend.utils.device import device_cc + + +parser = argparse.ArgumentParser( + description=("Launch a 2d convolution kernel from Python. 
" + "See https://docs.nvidia.com/deeplearning/performance/dl-performance-convolutional/index.html#convo-intro for notation.")) +parser.add_argument("--n", default=1, type=int, help="N dimension of the convolution") +parser.add_argument("--c", default=64, type=int, help="C dimension of the convolution") +parser.add_argument("--h", default=32, type=int, help="H dimension of the convolution") +parser.add_argument("--w", default=32, type=int, help="W dimension of the convolution") +parser.add_argument("--k", default=32, type=int, help="N dimension of the convolution") +parser.add_argument("--r", default=3, type=int, help="R dimension of the convolution") +parser.add_argument("--s", default=3, type=int, help="S dimension of the convolution") +parser.add_argument('--print_cuda', action="store_true", help="Print the underlying CUDA kernel") + +try: + args = parser.parse_args() +except: + sys.exit(0) + +# Check that the device is of a sufficient compute capability +cc = device_cc() +assert cc >= 70, "The CUTLASS Python Conv2d example requires compute capability greater than or equal to 70." + +alignment = 1 + +np.random.seed(0) + +# Allocate a pool of device memory to be used by the kernel +pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32) + +# Set the compiler to use to NVCC +pycutlass.compiler.nvcc() + +# Set up A, B, C and accumulator +A = TensorDescription(cutlass_bindings.float16, cutlass_bindings.TensorNHWC, alignment) +B = TensorDescription(cutlass_bindings.float16, cutlass_bindings.TensorNHWC, alignment) +C = TensorDescription(cutlass_bindings.float32, cutlass_bindings.TensorNHWC, alignment) +element_acc = cutlass_bindings.float32 +element_epilogue = cutlass_bindings.float32 + +# Select instruction shape based on the Tensor Core instructions supported +# by the device on which we are running +if cc == 70: + instruction_shape = [8, 8, 4] +elif cc == 75: + instruction_shape = [16, 8, 8] +else: + # Use CUTLASS kernels for CC 80 by default (e.g., for cases in which SM86 is used) + cc = 80 + instruction_shape = [16, 8, 16] + +math_inst = MathInstruction( + instruction_shape, + A.element, B.element, element_acc, + cutlass_bindings.OpClass.TensorOp, + MathOperation.multiply_add +) + +tile_description = TileDescription( + [128, 128, 32], # Threadblock shape + 2, # Number of stages + [2, 2, 1], # Number of warps within each dimension of the threadblock shape + math_inst +) + +epilogue_functor = pycutlass.LinearCombination(C.element, C.alignment, element_acc, element_epilogue) + +operation = Conv2dOperation( + conv_kind=cutlass_bindings.conv.Operator.fprop, + iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized, + arch=cc, tile_description=tile_description, + A=A, B=B, C=C, stride_support=StrideSupport.Unity, + epilogue_functor=epilogue_functor +) + +if args.print_cuda: + print(operation.rt_module.emit()) + +operations = [operation, ] + +# Compile the operation +pycutlass.compiler.add_module(operations) + +# Randomly initialize tensors + +problem_size = cutlass_bindings.conv.Conv2dProblemSize( + cutlass_bindings.Tensor4DCoord(args.n, args.h, args.c, args.w), + cutlass_bindings.Tensor4DCoord(args.k, args.r, args.s, args.c), + cutlass_bindings.Tensor4DCoord(0, 0, 0, 0), # Padding + cutlass_bindings.MatrixCoord(1, 1), # Strides + cutlass_bindings.MatrixCoord(1, 1), # Dilation + cutlass_bindings.conv.Mode.cross_correlation, + 1, # Split k slices + 1 # Groups +) + +tensor_A_size = cutlass_bindings.conv.implicit_gemm_tensor_a_size(operation.conv_kind, problem_size) 
+tensor_B_size = cutlass_bindings.conv.implicit_gemm_tensor_b_size(operation.conv_kind, problem_size) +tensor_C_size = cutlass_bindings.conv.implicit_gemm_tensor_c_size(operation.conv_kind, problem_size) + +tensor_A = torch.ceil(torch.empty(size=(tensor_A_size,), dtype=torch.float16, device="cuda").uniform_(-8.5, 7.5)) +tensor_B = torch.ceil(torch.empty(size=(tensor_B_size,), dtype=torch.float16, device="cuda").uniform_(-8.5, 7.5)) +tensor_C = torch.ceil(torch.empty(size=(tensor_C_size,), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5)) +tensor_D = torch.ones(size=(tensor_C_size,), dtype=torch.float32, device="cuda") + +alpha = 1. +beta = 0. + +arguments = Conv2dArguments( + operation=operation, problem_size=problem_size, + A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D, + output_op=operation.epilogue_type(alpha, beta) +) + +# Run the operation +operation.run(arguments) +arguments.sync() + +# Run the host reference module and compare to the CUTLASS result +reference = Conv2dReferenceModule(A, B, C, operation.conv_kind) +tensor_D_ref = reference.run(tensor_A, tensor_B, tensor_C, problem_size, alpha, beta) + +try: + assert torch.equal(tensor_D, tensor_D_ref) +except: + assert torch.allclose(tensor_D, tensor_D_ref, rtol=1e-2) + +print("Passed.") diff --git a/examples/40_cutlass_py/customizable/README.md b/examples/40_cutlass_py/customizable/README.md new file mode 100644 index 0000000000..e8aeee9e71 --- /dev/null +++ b/examples/40_cutlass_py/customizable/README.md @@ -0,0 +1,167 @@ +# Customizable Python Interface Examples +This directory contains examples of using the CUTLASS Python interface with a variety of configurations for kernels. + +For all the tests, add `--print_cuda` to print the underlying CUDA kernel. Use `-h` or `--help` to display the help message. + +## GEMM Examples +The GEMM examples use numpy to create input tensors and verify the results. 
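+
+As a rough sketch (not the exact code in `gemm.py`), the verification amounts to computing a
+host reference with numpy and comparing it against the device result. Shapes and scalars below
+are illustrative only, taken from the `-p 512 256 128 -alpha 1.0 -beta 0.5` settings used in the
+commands that follow:
+```python
+import numpy as np
+
+M, N, K = 512, 256, 128        # -p 512 256 128
+alpha, beta = 1.0, 0.5         # -alpha 1.0 -beta 0.5
+
+A = np.random.rand(M, K)
+B = np.random.rand(K, N)
+C = np.random.rand(M, N)
+
+D_ref = alpha * (A @ B) + beta * C   # host reference
+# D is the output produced by the CUTLASS kernel; the examples then check, roughly:
+# assert np.allclose(D, D_ref)
+```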
+### GEMM F64 Example +Example 1: SM80_Device_Gemm_f64t_f64n_f64n_tensor_op_f64_32x32x16_16x16x16 +```python +python gemm.py -i 8 8 4 -ta float64 -tb float64 -tc float64 -tacc float64 -m multiply_add -op TensorOp -b 32 32 16 -s 4 -w 2 2 1 -cc 80 -la ColumnMajor -aa 1 -lb RowMajor -ab 1 -lc RowMajor -ac 1 -te float64 -ep LinearCombination -sw IdentitySwizzle1 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 1 +``` +Example 2: SM80_Device_Gemm_f64n_f64t_f64n_tensor_op_f64_64x64x16_32x32x16, split_k(2)_serial +```python +python gemm.py -i 8 8 4 -ta float64 -tb float64 -tc float64 -tacc float64 -m multiply_add -op TensorOp -b 64 64 16 -s 4 -w 2 2 1 -cc 80 -la RowMajor -aa 1 -lb ColumnMajor -ab 1 -lc RowMajor -ac 1 -te float64 -ep LinearCombination -sw IdentitySwizzle1 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 2 +``` + +### GEMM F32 Example +Example 1: SM80_Device_Gemm_f32n_f32t_f32n_tensor_op_bf16_f32_128x128x32_64x64x32 +```python +python gemm.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add_fast_bf16 -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la RowMajor -aa 4 -lb ColumnMajor -ab 4 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle1 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 1 +``` +Example 2: SM80_Device_Gemm_f32t_f32t_f32n_tensor_op_f32_128x128x32_64x64x32, split_k(2)_parallel +```python +python gemm.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 4 -lb ColumnMajor -ab 4 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle1 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm GemmSplitKParallel -k 2 +``` +Example 3: SM80_Device_Gemm_f32t_f32t_f32n_tensor_op_fast_accurate_f32_64x64x32_32x32x32, split_k(4)_serial +```python +python gemm.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add_fast_f32 -op TensorOp -b 64 64 32 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 4 -lb ColumnMajor -ab 4 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle1 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 4 +``` + +### GEMM F16 Example +Example 1: SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32 +```python +python gemm.py -i 16 8 16 -ta float16 -tb float16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb RowMajor -ab 8 -lc ColumnMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle4 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 1 +``` +Example 2: SM80_Device_Gemm_f16t_f16t_f16n_tensor_op_f32_128x128x64_64x64x64, split_k(2)_serial +```python +python gemm.py -i 16 8 16 -ta float16 -tb float16 -tc float16 -tacc float32 -m multiply_add -op TensorOp -b 128 128 64 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb ColumnMajor -ab 8 -lc RowMajor -ac 8 -te float32 -ep LinearCombination -sw IdentitySwizzle2 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 2 +``` +Example 3: SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32_256x128x64_64x64x64, split_k(3)_serial +```python +python gemm.py -i 16 8 16 -ta float16 -tb float16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 256 128 64 -s 3 -w 4 2 1 -cc 80 -la ColumnMajor -aa 8 -lb ColumnMajor -ab 8 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle1 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm GemmSplitKParallel -k 3 +``` + +### GEMM BF16 Example +Example 1: Device_Gemm_bf16t_bf16t_f32n_tensor_op_f32_64x128x64_32x64x64, split_k(5)_parallel 
+```python +python gemm.py -i 16 8 16 -ta bfloat16 -tb bfloat16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 64 128 64 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb ColumnMajor -ab 8 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle2 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm GemmSplitKParallel -k 5 +``` + +### GEMM Int8 Example +Example 1: SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32_256x128x128_64x64x128 +```python +python gemm.py -i 16 8 32 -ta int8 -tb int8 -tc int8 -tacc int32 -m multiply_add -op TensorOp -b 128 128 128 -s 3 -w 2 2 1 -cc 80 -la RowMajor -aa 16 -lb ColumnMajor -ab 16 -lc RowMajor -ac 16 -te float32 -ep FastLinearCombinationClamp -sw IdentitySwizzle2 -p 512 512 512 -alpha 1.0 -beta 0.0 -gm Gemm -k 1 +``` + +### Batched & Array GEMM +Example 1: Batched GEMM +```python +python gemm.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add_fast_bf16 -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la RowMajor -aa 4 -lb ColumnMajor -ab 4 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -sw BatchedIdentitySwizzle -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Batched -k 1 -batch 3 +``` +Example 2: Array GEMM +```python +python gemm.py -i 16 8 16 -ta float16 -tb float16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb RowMajor -ab 8 -lc ColumnMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle4 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Array -k 1 -batch 2 +``` +*** +## GEMM Grouped Examples +The GEMM Grouped examples use numpy to create input tensors and verify the results. + +Example 1: SM80_Device_GemmGrouped_f16t_f16t_f32t_tensor_op_f32_128x128x32_64x64x32, device schedule +```python +python gemm_grouped.py -i 16 8 16 -ta float16 -tb float16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb ColumnMajor -ab 8 -lc ColumnMajor -ac 4 -te float32 -ep LinearCombination -p ./grouped_gemm_problem_size.csv -alpha 1.0 -beta 0.0 -pm Device +``` +Example 2: SM80_Device_GemmGrouped_f64n_f64n_f64t_tensor_op_f64_64x64x16_32x32x16, host schedule +```python +python gemm_grouped.py -i 8 8 4 -ta float64 -tb float64 -tc float64 -tacc float64 -m multiply_add -op TensorOp -b 64 64 16 -s 4 -w 2 2 1 -cc 80 -la RowMajor -aa 1 -lb RowMajor -ab 1 -lc ColumnMajor -ac 1 -te float64 -ep LinearCombination -p ./grouped_gemm_problem_size.csv -alpha 1.0 -beta 1.0 -pm Host +``` +Example 3: SM80_Device_GemmGrouped_f32n_f32n_f32n_simt_f32_128x64x8_64x32x1, device schedule +```python +python gemm_grouped.py -i 1 1 1 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add -op Simt -b 128 64 8 -s 4 -w 2 2 1 -cc 80 -la RowMajor -aa 1 -lb RowMajor -ab 1 -lc RowMajor -ac 1 -te float32 -ep LinearCombination -p ./grouped_gemm_problem_size.csv -alpha 2.0 -beta 1.0 -pm Device +``` +Example 4: SM80_Device_GemmGrouped_f16t_f16t_f32t_tensor_op_f32_128x128x32_64x64x32, device schedule +```python +python gemm_grouped.py -i 16 8 16 -ta float16 -tb float16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb ColumnMajor -ab 8 -lc ColumnMajor -ac 4 -te float32 -ep LinearCombination -p ./grouped_gemm_problem_size.csv -alpha 2.0 -beta 1.0 -pm Device +``` +*** +## Conv2d Example +The Conv2d examples use pytorch to create input tensors and verify the results. Pytorch can be installed following the [official website](https://pytorch.org/get-started/locally/). 
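+For most environments, `pip install torch` with a CUDA-enabled build is sufficient; see the linked page for platform-specific commands.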
+### Conv2d F32 Fprop +Example 1: SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32 +```python +python conv2d.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 16 -s 3 -w 2 2 1 -cc 80 -la TensorNHWC -aa 4 -lb TensorNHWC -ab 4 -lc TensorNHWC -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle1 -co fprop -st Strided -ia optimized -sm Serial -k 1 -nhwc 1 13 17 8 -krsc 24 3 3 8 -pad 0 0 0 0 -stride 2 2 -dilation 1 1 -alpha 1.0 -beta 0.0 +``` +Example 2: SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align2 +```python +python conv2d.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 16 -s 3 -w 2 2 1 -cc 80 -la TensorNHWC -aa 2 -lb TensorNHWC -ab 2 -lc TensorNHWC -ac 2 -te float32 -ep LinearCombination -sw IdentitySwizzle2 -co fprop -st Strided -ia optimized -sm Serial -k 2 -nhwc 1 4 4 12 -krsc 8 3 3 12 -pad 0 0 0 0 -stride 3 3 -dilation 1 1 -alpha 1.0 -beta 1.0 +``` +Example 3: SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32 +```python +python conv2d.py -i 1 1 1 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add -op Simt -b 128 128 8 -s 4 -w 4 2 1 -cc 80 -la TensorNHWC -aa 4 -lb TensorNHWC -ab 4 -lc TensorNHWC -ac 1 -te float32 -ep LinearCombination -sw IdentitySwizzle4 -co fprop -st Strided -ia analytic -sm Parallel -k 3 -nhwc 1 71 80 32 -krsc 64 5 5 32 -pad 2 2 2 2 -stride 2 2 -dilation 1 1 -alpha 1.0 -beta 1.0 +``` +### Conv2d F32 Wgrad +Example 1: Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align1 +```python +python conv2d.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la TensorNHWC -aa 1 -lb TensorNHWC -ab 1 -lc TensorNHWC -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle1 -co wgrad -st Strided -ia optimized -sm Serial -k 1 -nhwc 1 8 8 1 -krsc 1 3 3 1 -pad 1 1 1 1 -stride 1 1 -dilation 1 1 -alpha 1.0 -beta 0.0 +``` +Example 2: Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32 +```python +python conv2d.py -i 1 1 1 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add -op Simt -b 128 128 8 -s 4 -w 2 4 1 -cc 80 -la TensorNHWC -aa 4 -lb TensorNHWC -ab 4 -lc TensorNHWC -ac 1 -te float32 -ep LinearCombination -sw IdentitySwizzle1 -co wgrad -st Strided -ia optimized -sm Serial -k 2 -nhwc 1 27 27 256 -krsc 512 3 3 256 -pad 1 1 1 1 -stride 2 1 -dilation 1 1 -alpha 1.0 -beta 0.0 +``` +### Conv2d F32 Dgrad +Example 1: Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32 +```python +python conv2d.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 16 -s 3 -w 2 2 1 -cc 80 -la TensorNHWC -aa 4 -lb TensorNHWC -ab 4 -lc TensorNHWC -ac 4 -te float32 -ep LinearCombination -sw StridedDgradIdentitySwizzle1 -co dgrad -st Strided -ia optimized -sm Serial -k 2 -nhwc 1 27 27 256 -krsc 512 3 3 256 -pad 1 1 1 1 -stride 2 1 -dilation 1 1 -alpha 1.0 -beta 0.0 +``` + +### Conv2d F16 Fprop +Example 1: SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32 +```python +python conv2d.py -i 16 8 16 -ta float16 -tb float16 -tc float16 -tacc float32 -m multiply_add -op TensorOp -b 128 128 64 -s 3 -w 2 2 1 -cc 80 -la TensorNHWC -aa 8 -lb TensorNHWC -ab 8 -lc TensorNHWC -ac 8 -te float32 -ep LinearCombination -sw 
IdentitySwizzle1 -co fprop -st Strided -ia optimized -sm Serial -k 1 -nhwc 1 27 27 256 -krsc 512 3 3 256 -pad 1 1 1 1 -stride 2 1 -dilation 1 1 -alpha 1.0 -beta 0.0 +``` +Example 2: SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2 +```python +python conv2d.py -i 16 8 16 -ta float16 -tb float16 -tc float16 -tacc float32 -m multiply_add -op TensorOp -b 128 128 64 -s 3 -w 2 2 1 -cc 80 -la TensorNHWC -aa 2 -lb TensorNHWC -ab 2 -lc TensorNHWC -ac 8 -te float32 -ep LinearCombination -sw IdentitySwizzle1 -co fprop -st Strided -ia few_channels -sm Serial -k 1 -nhwc 1 16 16 2 -krsc 16 3 3 2 -pad 1 1 1 1 -stride 2 2 -dilation 1 1 -alpha 1.0 -beta 0.0 +``` +Example 3: SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8 +```python +python conv2d.py -i 16 8 16 -ta float16 -tb float16 -tc float16 -tacc float32 -m multiply_add -op TensorOp -b 128 128 64 -s 3 -w 2 2 1 -cc 80 -la TensorNHWC -aa 8 -lb TensorNHWC -ab 8 -lc TensorNHWC -ac 8 -te float32 -ep LinearCombination -sw IdentitySwizzle2 -co fprop -st Strided -ia fixed_channels -sm Serial -k 1 -nhwc 1 8 8 8 -krsc 16 3 3 8 -pad 1 1 1 1 -stride 2 2 -dilation 1 1 -alpha 1.0 -beta 0.0 +``` +Example 4: SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32_align4 +```python +python conv2d.py -i 16 8 16 -ta float16 -tb float16 -tc float16 -tacc float32 -m multiply_add -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la TensorNHWC -aa 4 -lb TensorNHWC -ab 4 -lc TensorNHWC -ac 4 -te float32 -ep LinearCombination -sw StridedDgradIdentitySwizzle1 -co dgrad -st Strided -ia optimized -sm Serial -k 1 -nhwc 1 56 56 12 -krsc 8 1 1 12 -pad 0 0 0 0 -stride 2 2 -dilation 1 1 -alpha 1.0 -beta 0.0 +``` + +## Epilogue +### Bias +To replace C with a bias vector, add `-bias` flag. 
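+For example, appending `-bias` to the F64 GEMM Example 1 above makes the kernel treat C as a bias vector instead of a full matrix (length N for the `RowMajor` C layout used here; see how `tensor_c_size` is computed in `gemm.py` below). This command is an illustrative variant of that example, not a separately listed configuration:
+```python
+python gemm.py -i 8 8 4 -ta float64 -tb float64 -tc float64 -tacc float64 -m multiply_add -op TensorOp -b 32 32 16 -s 4 -w 2 2 1 -cc 80 -la ColumnMajor -aa 1 -lb RowMajor -ab 1 -lc RowMajor -ac 1 -te float64 -ep LinearCombination -sw IdentitySwizzle1 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 1 -bias
+```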
+### Activation function +Example 1: ReLU +```python +python gemm.py -i 8 8 4 -ta float64 -tb float64 -tc float64 -tacc float64 -m multiply_add -op TensorOp -b 32 32 16 -s 4 -w 2 2 1 -cc 80 -la ColumnMajor -aa 1 -lb RowMajor -ab 1 -lc RowMajor -ac 1 -te float64 -ep LinearCombination -sw IdentitySwizzle1 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 1 -bias -activ relu +``` +Example 2: leaky ReLU +```python +python gemm.py -i 8 8 4 -ta float64 -tb float64 -tc float64 -tacc float64 -m multiply_add -op TensorOp -b 64 64 16 -s 4 -w 2 2 1 -cc 80 -la RowMajor -aa 1 -lb ColumnMajor -ab 1 -lc RowMajor -ac 1 -te float64 -ep LinearCombination -sw IdentitySwizzle1 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 2 -bias -activ leaky_relu -activ_arg 0.2 +``` +Example 3: tanh (alpha=0 to avoid saturation) +```python +python gemm.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 4 -lb ColumnMajor -ab 4 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle1 -p 512 256 128 -alpha 0.0 -beta 0.5 -gm GemmSplitKParallel -k 2 -bias -activ tanh +``` +Example 4: sigmoid +```python +python gemm_grouped.py -i 8 8 4 -ta float64 -tb float64 -tc float64 -tacc float64 -m multiply_add -op TensorOp -b 64 64 16 -s 4 -w 2 2 1 -cc 80 -la RowMajor -aa 1 -lb RowMajor -ab 1 -lc ColumnMajor -ac 1 -te float64 -ep LinearCombination -p ./grouped_gemm_problem_size.csv -alpha 0.0 -beta 0.5 -pm Host -bias -activ sigmoid +``` +Example 5: SiLU +```python +python conv2d.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 16 -s 3 -w 2 2 1 -cc 80 -la TensorNHWC -aa 2 -lb TensorNHWC -ab 2 -lc TensorNHWC -ac 2 -te float32 -ep LinearCombination -sw IdentitySwizzle2 -co fprop -st Strided -ia optimized -sm Serial -k 2 -nhwc 1 4 4 12 -krsc 8 3 3 12 -pad 0 0 0 0 -stride 3 3 -dilation 1 1 -alpha 0.0 -beta 0.5 -bias -activ silu +``` +Example 6: HardSwish +```python +python conv2d.py -i 16 8 16 -ta float16 -tb float16 -tc float16 -tacc float32 -m multiply_add -op TensorOp -b 128 128 64 -s 3 -w 2 2 1 -cc 80 -la TensorNHWC -aa 2 -lb TensorNHWC -ab 2 -lc TensorNHWC -ac 8 -te float32 -ep LinearCombination -sw IdentitySwizzle1 -co fprop -st Strided -ia few_channels -sm Serial -k 1 -nhwc 1 16 16 2 -krsc 16 3 3 2 -pad 1 1 1 1 -stride 2 2 -dilation 1 1 -alpha 0.0 -beta 0.5 -bias -activ hardswish +``` +Example 7: GELU +```python +python gemm.py -i 16 8 16 -ta bfloat16 -tb bfloat16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 64 128 64 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb ColumnMajor -ab 8 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle2 -p 512 256 128 -alpha 0.0 -beta 0.5 -gm GemmSplitKParallel -k 5 -bias -activ gelu +``` diff --git a/examples/40_cutlass_py/customizable/conv2d.py b/examples/40_cutlass_py/customizable/conv2d.py new file mode 100644 index 0000000000..c6cbf87a8d --- /dev/null +++ b/examples/40_cutlass_py/customizable/conv2d.py @@ -0,0 +1,331 @@ +################################################################################ +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ + +import sys +print("This example is deprecated. Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + +import numpy as np +import cutlass.backend as pycutlass +from cutlass.backend import * +from cutlass.backend.utils.device import device_cc +from cutlass.backend.conv2d_operation import * +from cutlass.backend.utils.reference_model import Conv2dReferenceModule +import torch.nn.functional as F + +import argparse + +# parse the arguments +parser = argparse.ArgumentParser(description="Launch CUTLASS convolution 2d kernels from Python") + +# Operation description +# math instruction description +parser.add_argument("-i", "--instruction_shape", + default=[1, 1, 1], nargs=3, type=int, + help="This option describes the size of MMA op") +parser.add_argument("-ta", "--element_a", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of elements in input tensor A') +parser.add_argument("-tb", "--element_b", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of elements in input tensor B') +parser.add_argument("-tc", "--element_c", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of elements in input tensor C and output tensor D') +parser.add_argument("-tacc", "--element_acc", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of accumulator') +parser.add_argument('-m', "--math", default="multiply_add", + type=str, choices=["multiply_add", "multiply_add_fast_bf16", "multiply_add_fast_f32"], help="math instruction") +parser.add_argument('-op', "--opcode", default="Simt", type=str, + choices=["Simt", 'TensorOp'], + help='This option describes whether you want to use tensor \ + cores (TensorOp) or regular SIMT cores (Simt) on GPU SM') +# tile description +parser.add_argument("-b", "--threadblock_shape", + default=[128, 128, 8], nargs=3, type=int, + help="This option describes the tile 
size a thread block with compute") +parser.add_argument("-s", "--stages", default=4, + type=int, help="Number of pipelines you want to use") +parser.add_argument("-w", "--warp_count", default=[ + 4, 2, 1], nargs=3, type=int, + help="This option describes the number of warps along M, N, and K of the threadblock") +parser.add_argument("-cc", "--compute_capability", default=80, + type=int, help="This option describes CUDA SM architecture number") +# A +parser.add_argument('-la', "--layout_a", default="TensorNHWC", type=str, choices=[ + "TensorNHWC", "TensorNC32HW32"], + help="Memory layout of input tensor A") +parser.add_argument('-aa', '--alignment_a', default=1, + type=int, help="Memory alignement of input tensor A") +# B +parser.add_argument('-lb', "--layout_b", default="TensorNHWC", type=str, choices=[ + "TensorNHWC", "TensorC32RSK32"], + help="Memory layout of input tensor B") +parser.add_argument('-ab', '--alignment_b', default=1, + type=int, help="Memory alignment of input tensor B") +# C +parser.add_argument('-lc', "--layout_c", default="TensorNHWC", type=str, choices=[ + "TensorNHWC", "TensorNC32HW32"], + help="Memory layout of input tensor C and output tensor D") +parser.add_argument('-ac', '--alignment_c', default=1, + type=int, help="Memory alignment of input tensor C and output tensor D") +# epilogue +parser.add_argument("-te", "--element_epilogue", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16'], + help='Data type of computation in the epilogue') +parser.add_argument("-ep", "--epilogue_functor", default="LinearCombination", + type=str, choices=['LinearCombination', 'FastLinearCombinationClamp', 'LinearCombinationClamp'], + help="This option describes the epilogue part of the kernel") +# swizzling +parser.add_argument("-sw", "--swizzling_functor", default="IdentitySwizzle1", type=str, choices=[ + "IdentitySwizzle1", "IdentitySwizzle2", "IdentitySwizzle4", "IdentitySwizzle8", + "HorizontalSwizzle", "StridedDgradIdentitySwizzle1", "StridedDgradIdentitySwizzle4", + "StridedDgradHorizontalSwizzle"], + help="This option describes how thread blocks are scheduled on GPU") +# conv related +parser.add_argument("-co", "--conv_kind", default="fprop", type=str, choices=['fprop', 'dgrad', 'wgrad'], + help="The type of convolution: forward propagation (fprop), \ + gradient of activation (dgrad), gradient of weight (wgrad)") +parser.add_argument("-st", "--stride_support", default="Strided", type=str, choices=["Strided", "Unity"], + ) +parser.add_argument("-ia", "--iterator_algorithm", default="analytic", type=str, + choices=["analytic", "optimized", "fixed_channels", "few_channels"], + help="This option describes iterator algorithm") + +# arguments +parser.add_argument("-sm", "--split_k_mode", default="Serial", type=str, choices=["Serial", "Parallel"], + help="Split K Mode. Serial is used for non-splitK or serial-splitK.\ + Parallel is used for parallel splitK.") +parser.add_argument('-k', '--split_k_slices', default=1, + type=int, help="Number of split-k partitions. 
(default 1)") +parser.add_argument("-nhwc", "--nhwc", nargs=4, type=int, help="input size (NHWC)") +parser.add_argument("-krsc", "--krsc", nargs=4, type=int, help="filter size (KRSC)") +parser.add_argument("-pad", "--pad", nargs=4, type=int, help="padding (pad_h, _, pad_w, _)") +parser.add_argument("-stride", "--stride", nargs=2, type=int, help="stride (stride_h, stride_w)") +parser.add_argument("-dilation", "--dilation", nargs=2, type=int, help="dilation (dilation_h, dilation_w)") +parser.add_argument("-alpha", "--alpha", default=1.0, type=float, help="alpha") +parser.add_argument("-beta", "--beta", default=0.0, type=float, help="beta") +parser.add_argument('-bias', '--bias', action='store_true', help="C is bias vector") +# Activation function +parser.add_argument("-activ", "--activation_function", default="identity", + choices=["identity", "relu", "leaky_relu", "tanh", "sigmoid", "silu", "hardswish", "gelu"], help="activation function") +parser.add_argument("-activ_arg", "--activation_args", default=[], nargs="+", type=float, + help="addition arguments for activation") + + +parser.add_argument('--print_cuda', action="store_true", + help="print the underlying CUDA kernel") + +try: + args = parser.parse_args() +except: + sys.exit(0) + +cc = device_cc() +if args.compute_capability != cc: + raise Exception(("Parameter --compute-capability of {} " + "does not match that of the device of {}.").format(args.compute_capability, cc)) + +pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32) + +np.random.seed(0) + +element_a = getattr(cutlass_bindings, args.element_a) +element_b = getattr(cutlass_bindings, args.element_b) +element_c = getattr(cutlass_bindings, args.element_c) +element_acc = getattr(cutlass_bindings, args.element_acc) +math_operation = getattr(MathOperation, args.math) +opclass = getattr(cutlass_bindings.OpClass, args.opcode) + +math_inst = MathInstruction( + args.instruction_shape, element_a, element_b, + element_acc, opclass, math_operation +) + +tile_description = TileDescription( + args.threadblock_shape, args.stages, args.warp_count, + math_inst +) + +layout_a = getattr(cutlass_bindings, args.layout_a) +layout_b = getattr(cutlass_bindings, args.layout_b) +layout_c = getattr(cutlass_bindings, args.layout_c) + +A = TensorDescription( + element_a, layout_a, args.alignment_a +) + +B = TensorDescription( + element_b, layout_b, args.alignment_b +) + +C = TensorDescription( + element_c, layout_c, args.alignment_c +) + +element_epilogue = getattr(cutlass_bindings, args.element_epilogue) +if (args.activation_function == "identity" + or (args.split_k_mode == "Parallel" and args.split_k_slices > 1)): + # + epilogue_functor = getattr(pycutlass, args.epilogue_functor)( + C.element, C.alignment, math_inst.element_accumulator, element_epilogue) +else: + epilogue_functor = getattr(pycutlass, "LinearCombinationGeneric")( + getattr(pycutlass, args.activation_function)(element_epilogue), + C.element, C.alignment, math_inst.element_accumulator, element_epilogue) + +iterator_algorithm = getattr(cutlass_bindings.conv.IteratorAlgorithm, args.iterator_algorithm) +swizzling_functor = getattr(cutlass_bindings, args.swizzling_functor) +stride_support = getattr(StrideSupport, args.stride_support) +conv_kind = getattr(cutlass_bindings.conv.Operator, args.conv_kind) + +operation = Conv2dOperation( + conv_kind=conv_kind, iterator_algorithm=iterator_algorithm, + arch=args.compute_capability, tile_description=tile_description, + A=A, B=B, C=C, stride_support=stride_support, + 
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor +) + +if args.print_cuda: + print(operation.rt_module.emit()) + +operations = [operation,] + +if args.split_k_mode == "Parallel" and args.split_k_slices > 1: + if (args.activation_function == "identity"): + epilogue_functor_reduction = getattr(pycutlass, args.epilogue_functor)( + C.element, C.alignment, math_inst.element_accumulator, element_epilogue) + else: + epilogue_functor_reduction = getattr(pycutlass, "LinearCombinationGeneric")( + getattr(pycutlass, args.activation_function)(element_epilogue), + C.element, C.alignment, math_inst.element_accumulator, element_epilogue) + reduction_operation = ReductionOperation( + shape=cutlass_bindings.MatrixCoord(4, 32 * C.alignment), + C=C, element_accumulator=element_acc, + element_compute=element_epilogue, + epilogue_functor=epilogue_functor_reduction, + count=C.alignment + ) + operations.append(reduction_operation) + +pycutlass.compiler.add_module(operations) + +problem_size = cutlass_bindings.conv.Conv2dProblemSize( + cutlass_bindings.Tensor4DCoord(args.nhwc[0], args.nhwc[1], args.nhwc[2], args.nhwc[3]), + cutlass_bindings.Tensor4DCoord(args.krsc[0], args.krsc[1], args.krsc[2], args.krsc[3]), + cutlass_bindings.Tensor4DCoord(args.pad[0], args.pad[1], args.pad[2], args.pad[3]), + cutlass_bindings.MatrixCoord(args.stride[0], args.stride[1]), + cutlass_bindings.MatrixCoord(args.dilation[0], args.dilation[1]), + cutlass_bindings.conv.Mode.cross_correlation, + args.split_k_slices, 1 +) + + +# User-provide inputs +tensor_A_size = cutlass_bindings.conv.implicit_gemm_tensor_a_size( + conv_kind, problem_size +) +tensor_B_size = cutlass_bindings.conv.implicit_gemm_tensor_b_size( + conv_kind, problem_size +) +if args.bias: + tensor_C_size = cutlass_bindings.conv.implicit_gemm_tensor_c_extent( + conv_kind, problem_size + ).at(3) +else: + tensor_C_size = cutlass_bindings.conv.implicit_gemm_tensor_c_size( + conv_kind, problem_size + ) + +tensor_D_size = cutlass_bindings.conv.implicit_gemm_tensor_c_size( + conv_kind, problem_size + ) + +if args.element_a != "int8": + tensor_A = torch.ceil(torch.empty(size=(tensor_A_size,), dtype=getattr(torch, args.element_a), device="cuda").uniform_(-8.5, 7.5)) +else: + tensor_A = torch.empty(size=(tensor_A_size,), dtype=getattr(torch, args.element_a), device="cuda").uniform_(-2, 2) + +if args.element_b != "int8": + tensor_B = torch.ceil(torch.empty(size=(tensor_B_size,), dtype=getattr(torch, args.element_b), device="cuda").uniform_(-8.5, 7.5)) +else: + tensor_B = torch.empty(size=(tensor_B_size,), dtype=getattr(torch, args.element_b), device="cuda").uniform_(-2, 2) + +if args.element_c != "int8": + tensor_C = torch.ceil(torch.empty(size=(tensor_C_size,), dtype=getattr(torch, args.element_c), device="cuda").uniform_(-8.5, 7.5)) +else: + tensor_C = torch.empty(size=(tensor_C_size,), dtype=getattr(torch, args.element_c), device="cuda").uniform_(-2, 2) + +tensor_D = torch.ones(size=(tensor_D_size,), dtype=getattr(torch, args.element_c), device="cuda") + +arguments = Conv2dArguments( + operation=operation, problem_size=problem_size, A=tensor_A, + B=tensor_B, C=tensor_C, D=tensor_D, + output_op = operation.epilogue_type(*([args.alpha, args.beta] + args.activation_args)), + split_k_mode=getattr(cutlass_bindings.conv.SplitKMode, args.split_k_mode), + split_k_slices=problem_size.split_k_slices +) + +if args.split_k_mode == "Parallel" and args.split_k_slices > 1: + implicit_gemm_size = cutlass_bindings.conv.implicit_gemm_problem_size(conv_kind, 
arguments.problem_size) + reduction_arguments = ReductionArguments( + reduction_operation, + problem_size=[implicit_gemm_size.m(), implicit_gemm_size.n()], + partitions=problem_size.split_k_slices, + workspace=arguments.ptr_D, + destination=tensor_D, + source=tensor_C, + output_op = reduction_operation.epilogue_type(*([args.alpha, args.beta] + args.activation_args)), + bias = arguments.bias + ) + +operation.run(arguments) + +if args.split_k_mode == "Parallel" and args.split_k_slices > 1: + reduction_operation.run(reduction_arguments) + reduction_arguments.sync() +else: + arguments.sync() + +reference_model = Conv2dReferenceModule(A, B, C, conv_kind) + +tensor_D_ref = reference_model.run(tensor_A, tensor_B, tensor_C, arguments.problem_size, args.alpha, args.beta, args.bias) +if (args.activation_function != "identity"): + tensor_D_ref = getattr(F, args.activation_function)(*([tensor_D_ref,] + args.activation_args)) + +try: + assert torch.equal(tensor_D, tensor_D_ref) +except: + assert torch.allclose(tensor_D, tensor_D_ref, rtol=1e-2) +print("Passed.") diff --git a/examples/40_cutlass_py/customizable/gemm.py b/examples/40_cutlass_py/customizable/gemm.py new file mode 100644 index 0000000000..670294ad2d --- /dev/null +++ b/examples/40_cutlass_py/customizable/gemm.py @@ -0,0 +1,331 @@ +################################################################################ +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ + +import sys +print("This example is deprecated. 
Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + +import numpy as np +import cutlass.backend as pycutlass +from cutlass.backend import * +from cutlass.backend.utils.device import device_cc +import cutlass_bindings +from bfloat16 import bfloat16 + +import argparse + + +# parse the arguments +parser = argparse.ArgumentParser(description="Launch CUTLASS GEMM kernels from Python: 'D = alpha * A * B + beta * C'") + +# Operation description +# math instruction description +parser.add_argument("-i", "--instruction_shape", + default=[1, 1, 1], nargs=3, type=int, + help="This option describes the size of MMA op") +parser.add_argument("-ta", "--element_a", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of elements in input tensor A') +parser.add_argument("-tb", "--element_b", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of elements in input tensor B') +parser.add_argument("-tc", "--element_c", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of elements in input tensor C and output tensor D') +parser.add_argument("-tacc", "--element_acc", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of accumulator') +parser.add_argument('-m', "--math", default="multiply_add", + type=str, choices=["multiply_add", "multiply_add_fast_bf16", "multiply_add_fast_f32"], help="math instruction") +parser.add_argument('-op', "--opcode", default="Simt", type=str, + choices=["Simt", 'TensorOp'], + help="This option describes whether you want to use tensor \ + cores (TensorOp) or regular SIMT cores (Simt) on GPU SM") +# tile description +parser.add_argument("-b", "--threadblock_shape", + default=[128, 128, 8], nargs=3, type=int, + help="This option describes the tile size a thread block with compute") +parser.add_argument("-s", "--stages", default=4, + type=int, help="Number of pipelines you want to use") +parser.add_argument("-w", "--warp_count", default=[4, 2, 1], nargs=3, type=int, + help="This option describes the number of warps along M, N, and K of the threadblock") +parser.add_argument("-cc", "--compute_capability", default=80, + type=int, help="This option describes CUDA SM architecture number") +# A +parser.add_argument('-la', "--layout_a", default="RowMajor", type=str, choices=[ + "RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"], + help="Memory layout of input tensor A") +parser.add_argument('-aa', '--alignment_a', default=1, + type=int, help="Memory alignement of input tensor A") +# B +parser.add_argument('-lb', "--layout_b", default="RowMajor", type=str, choices=[ + "RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"], + help="Memory layout of input tensor B") +parser.add_argument('-ab', '--alignment_b', default=1, + type=int, help="Memory alignment of input tensor B") +# C +parser.add_argument('-lc', "--layout_c", default="RowMajor", type=str, choices=[ + "RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"], + help="Memory layout of input tensor C and output tensor D") +parser.add_argument('-ac', '--alignment_c', default=1, + type=int, help="Memory alignment of input tensor C and output tensor D") +# epilogue +parser.add_argument("-te", "--element_epilogue", default="float32", type=str, + 
choices=['float64', 'float32', 'float16', 'bfloat16'], help='Epilogue datatype') +parser.add_argument("-ep", "--epilogue_functor", default="LinearCombination", + type=str, choices=['LinearCombination', 'FastLinearCombinationClamp', 'LinearCombinationClamp'], + help="This option describes the epilogue part of the kernel") +# swizzling +parser.add_argument("-sw", "--swizzling_functor", default="IdentitySwizzle1", type=str, choices=[ + "IdentitySwizzle1", "IdentitySwizzle2", "IdentitySwizzle4", "IdentitySwizzle8", "HorizontalSwizzle", "BatchedIdentitySwizzle"], + help="This option describes how thread blocks are scheduled on GPU") + +# Argument +parser.add_argument("-p", "--problem_size", + default=[128, 128, 128], nargs=3, type=int, + help="GEMM problem size M, N, K") +parser.add_argument("-alpha", "--alpha", default=1.0, type=float, + help="Scaling factor of A * B") +parser.add_argument("-beta", "--beta", default=0.0, type=float, + help="Scaling factor of C") +parser.add_argument("-gm", "--gemm_mode", default="Gemm", type=str, + choices=["Gemm", "GemmSplitKParallel", "Batched", "Array"], + help="GEMM mode. Gemm is used for non-splitK or serial-splitK. \ + GemmSplitKParallel is used for parallel splitK") +parser.add_argument('-k', '--split_k_slices', default=1, + type=int, help="Number of split-k partitions. (default 1)") +parser.add_argument('-bias', '--bias', action='store_true', help="C is bias vector") +parser.add_argument('-batch', '--batch', default=1, type=int, help="batch size for batched GEMM") + +# Activation function +parser.add_argument("-activ", "--activation_function", default="identity", + choices=["identity", "relu", "leaky_relu", "tanh", "sigmoid", "silu", "hardswish", "gelu"], help="activation function") +parser.add_argument("-activ_arg", "--activation_args", default=[], nargs="+", type=float, + help="addition arguments for activation") +parser.add_argument('--print_cuda', action="store_true", + help="print the underlying CUDA kernel") + +try: + args = parser.parse_args() +except: + sys.exit(0) + +cc = device_cc() +if args.compute_capability != cc: + raise Exception(("Parameter --compute-capability of {} " + "does not match that of the device of {}.").format(args.compute_capability, cc)) + +pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32) +pycutlass.compiler.nvcc() + +np.random.seed(0) + +element_a = getattr(cutlass_bindings, args.element_a) +element_b = getattr(cutlass_bindings, args.element_b) +element_c = getattr(cutlass_bindings, args.element_c) +element_acc = getattr(cutlass_bindings, args.element_acc) +math_operation = getattr(MathOperation, args.math) +opclass = getattr(cutlass_bindings.OpClass, args.opcode) + +math_inst = MathInstruction( + args.instruction_shape, element_a, element_b, + element_acc, opclass, math_operation +) + +tile_description = TileDescription( + args.threadblock_shape, args.stages, args.warp_count, + math_inst +) + +layout_a = getattr(cutlass_bindings, args.layout_a) +layout_b = getattr(cutlass_bindings, args.layout_b) +layout_c = getattr(cutlass_bindings, args.layout_c) + +A = TensorDescription( + element_a, layout_a, args.alignment_a +) + +B = TensorDescription( + element_b, layout_b, args.alignment_b +) + +C = TensorDescription( + element_c, layout_c, args.alignment_c +) + +element_epilogue = getattr(cutlass_bindings, args.element_epilogue) +if (args.activation_function == "identity" + or (args.gemm_mode == "GemmSplitKParallel" and args.split_k_slices > 1)): + # + epilogue_functor = getattr(pycutlass, 
args.epilogue_functor)( + C.element, C.alignment, math_inst.element_accumulator, element_epilogue) +else: + epilogue_functor = getattr(pycutlass, "LinearCombinationGeneric")( + getattr(pycutlass, args.activation_function)(element_epilogue), + C.element, C.alignment, math_inst.element_accumulator, element_epilogue) + +swizzling_functor = getattr(cutlass_bindings, args.swizzling_functor) + +operation = GemmOperationUniversal( + arch=args.compute_capability, tile_description=tile_description, + A=A, B=B, C=C, + epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor +) + +if args.print_cuda: + print(operation.rt_module.emit()) + +operations = [operation, ] + +if args.gemm_mode == "GemmSplitKParallel": + if (args.activation_function == "identity"): + epilogue_functor_reduction = getattr(pycutlass, args.epilogue_functor)( + C.element, C.alignment, math_inst.element_accumulator, element_epilogue) + else: + epilogue_functor_reduction = getattr(pycutlass, "LinearCombinationGeneric")( + getattr(pycutlass, args.activation_function)(element_epilogue), + C.element, C.alignment, math_inst.element_accumulator, element_epilogue) + + reduction_operation = ReductionOperation( + shape=cutlass_bindings.MatrixCoord(4, 32 * C.alignment), + C=C, element_accumulator=element_acc, + element_compute=element_epilogue, + epilogue_functor=epilogue_functor_reduction, + count=C.alignment + ) + operations.append(reduction_operation) + +pycutlass.compiler.add_module(operations) + +# User-provide inputs + +problem_size = cutlass_bindings.gemm.GemmCoord( + args.problem_size[0], args.problem_size[1], args.problem_size[2]) + +tensor_a_size = args.batch * problem_size.m() * problem_size.k() +if args.element_a != "int8": + if args.element_a == "bfloat16": + tensor_A = np.ceil( + np.random.uniform(low=-8.5, high=7.5, size=(tensor_a_size,)) + ).astype(bfloat16) + else: + tensor_A = np.ceil( + np.random.uniform(low=-8.5, high=7.5, size=(tensor_a_size,)) + ).astype(getattr(np, args.element_a)) +else: + tensor_A = np.random.uniform( + low=-2, high=2,size=(tensor_a_size,) + ).astype(getattr(np, args.element_a)) + +tensor_b_size = args.batch * problem_size.k() * problem_size.n() +if args.element_b != "int8": + if args.element_b == "bfloat16": + tensor_B = np.ceil( + np.random.uniform(low=-8.5, high=7.5, size=(tensor_b_size,)) + ).astype(bfloat16) + else: + tensor_B = np.ceil( + np.random.uniform(low=-8.5, high=7.5, size=(tensor_b_size,)) + ).astype(getattr(np, args.element_b)) +else: + tensor_B = np.random.uniform( + low=-2, high=2, size=(tensor_b_size,) + ).astype(getattr(np, args.element_b)) + +if args.element_c != "int8": + if args.bias: + if args.layout_c == "RowMajor": + tensor_c_size = args.batch * problem_size.n() + elif args.layout_c == "ColumnMajor": + tensor_c_size = args.batch * problem_size.m() + else: + raise ValueError(args.layout_c) + else: + tensor_c_size = args.batch * problem_size.m() * problem_size.n() + if args.element_c == "bfloat16": + tensor_C = np.ceil( + np.random.uniform(low=-8.5, high=7.5, size=(tensor_c_size,)) + ).astype(bfloat16) + else: + tensor_C = np.ceil( + np.random.uniform(low=-8.5, high=7.5, size=(tensor_c_size,)) + ).astype(getattr(np, args.element_c)) +else: + tensor_C = np.random.uniform( + low=-2, high=2, size=(args.batch * problem_size.m() * problem_size.n(),) + ).astype(getattr(np, args.element_c)) + +tensor_D = np.zeros( + shape=(args.batch * problem_size.m() * problem_size.n(),) +).astype(getattr(np, args.element_c)) + +output_op = operation.epilogue_type(*([args.alpha, 
args.beta] + args.activation_args)) + +arguments = GemmArguments( + operation=operation, problem_size=problem_size, + A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D, + output_op=output_op, + gemm_mode=getattr(cutlass_bindings.gemm.Mode, args.gemm_mode), + split_k_slices=args.split_k_slices, batch=args.batch +) + +if args.gemm_mode == "GemmSplitKParallel": + reduction_arguments = ReductionArguments( + operation=reduction_operation, + problem_size=[problem_size.m(), problem_size.n()], + partitions=args.split_k_slices, workspace=arguments.ptr_D, + destination=tensor_D, source=tensor_C, + output_op=reduction_operation.epilogue_type(*([args.alpha, args.beta] + args.activation_args)), + bias = arguments.bias + ) + +operation.run(arguments) + +if args.gemm_mode == "GemmSplitKParallel": + reduction_operation.run(reduction_arguments) + reduction_arguments.sync() +else: + arguments.sync() + +# run the host reference module +reference = ReferenceModule(A, B, C) +tensor_D_ref = reference.run( + tensor_A, tensor_B, tensor_C, problem_size, args.alpha, args.beta, args.bias, args.batch) + +tensor_D_ref = getattr(pycutlass, args.activation_function).numpy(*([tensor_D_ref,] + args.activation_args)) + +try: + assert np.array_equal(tensor_D, tensor_D_ref) +except: + assert np.allclose(tensor_D, tensor_D_ref, atol=1e-5) +print("Passed.") diff --git a/examples/40_cutlass_py/customizable/gemm_grouped.py b/examples/40_cutlass_py/customizable/gemm_grouped.py new file mode 100644 index 0000000000..ac2adefacb --- /dev/null +++ b/examples/40_cutlass_py/customizable/gemm_grouped.py @@ -0,0 +1,298 @@ +################################################################################ +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ + +import sys +print("This example is deprecated. 
Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + +import numpy as np +import cutlass.backend as pycutlass +from cutlass.backend import * +from cutlass.backend.utils.device import device_cc +import csv + +import argparse + +# parse the arguments +parser = argparse.ArgumentParser( + description="Launch CUTLASS GEMM Grouped kernels from Python") + +# Operation description +# math instruction description +parser.add_argument("-i", "--instruction_shape", + default=[1, 1, 1], nargs=3, type=int, + help="This option describes the size of MMA op") +parser.add_argument("-ta", "--element_a", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of elements in input tensor A') +parser.add_argument("-tb", "--element_b", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of elements in input tensor B') +parser.add_argument("-tc", "--element_c", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of elements in input tensor C and output tensor D') +parser.add_argument("-tacc", "--element_acc", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'], + help='Data type of accumulator') +parser.add_argument('-m', "--math", default="multiply_add", + type=str, choices=["multiply_add", "multiply_add_fast_bf16", "multiply_add_fast_f32"], help="math instruction") +parser.add_argument('-op', "--opcode", default="Simt", type=str, + choices=["Simt", 'TensorOp'], help='This option describes whether you want to use tensor \ + cores (TensorOp) or regular SIMT cores (Simt) on GPU SM') +# tile description +parser.add_argument("-b", "--threadblock_shape", + default=[128, 128, 8], nargs=3, type=int, + help="This option describes the tile size a thread block with compute") +parser.add_argument("-s", "--stages", default=4, + type=int, help="Number of pipelines you want to use") +parser.add_argument("-w", "--warp_count", default=[ + 4, 2, 1], nargs=3, type=int, + help="This option describes the number of warps along M, N, and K of the threadblock") +parser.add_argument("-cc", "--compute_capability", default=80, + type=int, help="This option describes CUDA SM architecture number") +# A +parser.add_argument('-la', "--layout_a", default="RowMajor", type=str, choices=[ + "RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"], + help="Memory layout of input tensor A") +parser.add_argument('-aa', '--alignment_a', default=1, + type=int, help="Memory alignment of input tensor A") +# B +parser.add_argument('-lb', "--layout_b", default="RowMajor", type=str, choices=[ + "RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"], + help="Memory layout of input tensor B") +parser.add_argument('-ab', '--alignment_b', default=1, + type=int, help="Memory alignment of input tensor B") +# C +parser.add_argument('-lc', "--layout_c", default="RowMajor", type=str, choices=[ + "RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"], + help="Memory layout of input tensor C and output tensor D") +parser.add_argument('-ac', '--alignment_c', default=1, + type=int, help="Memory alignment of input tensor C and output tensor D") +# epilogue +parser.add_argument("-te", "--element_epilogue", default="float32", type=str, + choices=['float64', 'float32', 'float16', 'bfloat16'], 
help='Epilogue datatype') +parser.add_argument("-ep", "--epilogue_functor", default="LinearCombination", + type=str, choices=['LinearCombination', 'FastLinearCombinationClamp', 'LinearCombinationClamp'], + help="This option describes the epilogue part of the kernel") +# swizzling +parser.add_argument("-sw", "--swizzling_functor", default="IdentitySwizzle1", type=str, choices=[ + "IdentitySwizzle1", "IdentitySwizzle2", "IdentitySwizzle4", "IdentitySwizzle8", "HorizontalSwizzle"], + help="This option describes how thread blocks are scheduled on GPU. \ + NOTE: Threadblock swizzling is currently not supported by CUTLASS's grouped kernels. \ + This parameter is passed in at present to match the APIs of other kernels. The parameter \ + is unused within the kernel") +# precompute mode +parser.add_argument("-pm", "--precompute_mode", + default="Device", type=str, choices=["Host", "Device"], + help="Grouped Gemm Scheduing on device only (Device) or using host precompute (Host)") +# arguments +parser.add_argument("-p", "--problem_size_dir", type=str, default="grouped_gemm_problem_size.csv", + help="path to the csv file contains the problem sizes") +parser.add_argument("-alpha", "--alpha", default=1.0, type=float, help="alpha") +parser.add_argument("-beta", "--beta", default=0.0, type=float, help="beta") +parser.add_argument('-bias', '--bias', action='store_true', help="C is bias vector") + +# Activation function +parser.add_argument("-activ", "--activation_function", default="identity", + choices=["identity", "relu", "leaky_relu", "tanh", "sigmoid", "silu", "hardswish", "gelu"], help="activation function") +parser.add_argument("-activ_arg", "--activation_args", default=[], nargs="+", type=float, + help="addition arguments for activation") +parser.add_argument('--print_cuda', action="store_true", + help="print the underlying CUDA kernel") + +try: + args = parser.parse_args() +except: + sys.exit(0) + +cc = device_cc() +if args.compute_capability != cc: + raise Exception(("Parameter --compute-capability of {} " + "does not match that of the device of {}.").format(args.compute_capability, cc)) + +pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32) + +np.random.seed(0) + +element_a = getattr(cutlass_bindings, args.element_a) +element_b = getattr(cutlass_bindings, args.element_b) +element_c = getattr(cutlass_bindings, args.element_c) +element_acc = getattr(cutlass_bindings, args.element_acc) +math_operation = getattr(MathOperation, args.math) +opclass = getattr(cutlass_bindings.OpClass, args.opcode) + +math_inst = MathInstruction( + args.instruction_shape, element_a, element_b, + element_acc, opclass, math_operation +) + +tile_description = TileDescription( + args.threadblock_shape, args.stages, args.warp_count, + math_inst +) + +layout_a = getattr(cutlass_bindings, args.layout_a) +layout_b = getattr(cutlass_bindings, args.layout_b) +layout_c = getattr(cutlass_bindings, args.layout_c) + +A = TensorDescription( + element_a, layout_a, args.alignment_a +) + +B = TensorDescription( + element_b, layout_b, args.alignment_b +) + +C = TensorDescription( + element_c, layout_c, args.alignment_c +) + +element_epilogue = getattr(cutlass_bindings, args.element_epilogue) +if args.activation_function == "identity": + epilogue_functor = getattr(pycutlass, args.epilogue_functor)( + C.element, C.alignment, math_inst.element_accumulator, element_epilogue) +else: + epilogue_functor = getattr(pycutlass, "LinearCombinationGeneric")( + getattr(pycutlass, args.activation_function)(element_epilogue), + C.element, 
C.alignment, math_inst.element_accumulator, element_epilogue) +swizzling_functor = getattr(cutlass_bindings, args.swizzling_functor) +precompute_mode = getattr(SchedulerMode, args.precompute_mode) + +operation = GemmOperationGrouped( + arch=args.compute_capability, tile_description=tile_description, + A=A, B=B, C=C, + epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor, + precompute_mode=precompute_mode +) + +if args.print_cuda: + print(operation.rt_module.emit()) + +pycutlass.compiler.add_module([operation, ]) + +reference_module = ReferenceModule(A, B, C) + +# get problems +problem_sizes = [] +with open(args.problem_size_dir) as csv_file: + reader = csv.reader(csv_file) + for row in reader: + problem_sizes.append( + cutlass_bindings.gemm.GemmCoord(int(row[0]), int(row[1]), int(row[2])) + ) + +problem_count = len(problem_sizes) + +tensor_As = [] +tensor_Bs = [] +tensor_Cs = [] +tensor_Ds = [] +problem_sizes_coord = [] +tensor_D_refs = [] + +for problem_size in problem_sizes: + if args.element_a != "int8": + if args.element_a == "bfloat16": + tensor_A = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m() + * problem_size.k(),))).astype(bfloat16) + else: + tensor_A = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m() + * problem_size.k(),))).astype(getattr(np, args.element_a)) + else: + tensor_A = np.random.uniform(low=-2, high=2, size=(problem_size.m() + * problem_size.k(),)).astype(getattr(np, args.element_a)) + + if args.element_b != "int8": + if args.element_b == "bfloat16": + tensor_B = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.k() + * problem_size.n(),))).astype(bfloat16) + else: + tensor_B = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.k() + * problem_size.n(),))).astype(getattr(np, args.element_b)) + else: + tensor_B = np.random.uniform(low=-2, high=2, size=(problem_size.k() + * problem_size.n(),)).astype(getattr(np, args.element_b)) + + if args.element_c != "int8": + if args.bias: + if args.layout_c == "RowMajor": + c_size = problem_size.n() + elif args.layout_c == "ColumnMajor": + c_size = problem_size.m() + else: + raise ValueError(args.layout_c) + else: + c_size = problem_size.m() * problem_size.n() + if args.element_c == "bfloat16": + tensor_C = np.ceil( + np.random.uniform(low=-8.5, high=7.5, size=(c_size,)) + ).astype(bfloat16) + else: + tensor_C = np.ceil( + np.random.uniform(low=-8.5, high=7.5, size=(c_size,)) + ).astype(getattr(np, args.element_c)) + else: + tensor_C = np.random.uniform( + low=-2, high=2, size=(problem_size.m() * problem_size.n(),) + ).astype(getattr(np, args.element_c)) + tensor_D = np.zeros( + shape=(problem_size.m() * problem_size.n(),) + ).astype(getattr(np, args.element_c)) + + tensor_As.append(tensor_A) + tensor_Bs.append(tensor_B) + tensor_Cs.append(tensor_C) + tensor_Ds.append(tensor_D) + tensor_D_ref = reference_module.run( + tensor_A, tensor_B, tensor_C, problem_size, + args.alpha, args.beta, args.bias) + tensor_D_ref = getattr(pycutlass, args.activation_function).numpy(*([tensor_D_ref,] + args.activation_args)) + tensor_D_refs.append(tensor_D_ref) + problem_sizes_coord.append(problem_size) + +arguments = GemmGroupedArguments( + operation, problem_sizes_coord, tensor_As, tensor_Bs, tensor_Cs, tensor_Ds, + output_op=operation.epilogue_type(*([args.alpha, args.beta] + args.activation_args)) +) + +operation.run(arguments) + +arguments.sync() + +for tensor_d, tensor_d_ref in zip(tensor_Ds, tensor_D_refs): + try: + assert np.array_equal(tensor_d, 
tensor_d_ref) + except: + assert np.allclose(tensor_d, tensor_d_ref, rtol=1e-5) + +print("Passed.") diff --git a/examples/40_cutlass_py/customizable/grouped_gemm_problem_size.csv b/examples/40_cutlass_py/customizable/grouped_gemm_problem_size.csv new file mode 100644 index 0000000000..d1d0dd00b2 --- /dev/null +++ b/examples/40_cutlass_py/customizable/grouped_gemm_problem_size.csv @@ -0,0 +1,3 @@ +128,128,128 +128,128,256 +512,128,384 diff --git a/examples/40_cutlass_py/gemm.py b/examples/40_cutlass_py/gemm.py new file mode 100644 index 0000000000..076f758287 --- /dev/null +++ b/examples/40_cutlass_py/gemm.py @@ -0,0 +1,153 @@ +################################################################################ +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +""" +Basic example of using the CUTLASS Python interface to run a GEMM +""" + +import sys +print("This example is deprecated. 
Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + +import argparse +import numpy as np + +import cutlass_bindings +import cutlass.backend as pycutlass +from cutlass.backend import * +from cutlass.backend.utils.device import device_cc + + +parser = argparse.ArgumentParser(description="Launch a GEMM kernel from Python: 'D = alpha * A * B + beta * C'") +parser.add_argument("--m", default=128, type=int, help="M dimension of the GEMM") +parser.add_argument("--n", default=128, type=int, help="N dimension of the GEMM") +parser.add_argument("--k", default=128, type=int, help="K dimension of the GEMM") +parser.add_argument('--print_cuda', action="store_true", help="Print the underlying CUDA kernel") + +try: + args = parser.parse_args() +except: + sys.exit(0) + +# Check that the device is of a sufficient compute capability +cc = device_cc() +assert cc >= 70, "The CUTLASS Python GEMM example requires compute capability greater than or equal to 70." + +alignment = 8 +assert args.m % alignment == 0, "M dimension of size {} is not divisible by alignment of {}".format(args.m, alignment) +assert args.n % alignment == 0, "N dimension of size {} is not divisible by alignment of {}".format(args.n, alignment) +assert args.k % alignment == 0, "K dimension of size {} is not divisible by alignment of {}".format(args.k, alignment) + +np.random.seed(0) + +# Allocate a pool of device memory to be used by the kernel +pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32) + +# Set the compiler to use to NVCC +pycutlass.compiler.nvcc() + +# Set up A, B, C and accumulator +A = TensorDescription(cutlass_bindings.float16, cutlass_bindings.ColumnMajor, alignment) +B = TensorDescription(cutlass_bindings.float16, cutlass_bindings.RowMajor, alignment) +C = TensorDescription(cutlass_bindings.float32, cutlass_bindings.ColumnMajor, alignment) +element_acc = cutlass_bindings.float32 +element_epilogue = cutlass_bindings.float32 + +# Select instruction shape based on the Tensor Core instructions supported +# by the device on which we are running +if cc == 70: + instruction_shape = [8, 8, 4] +elif cc == 75: + instruction_shape = [16, 8, 8] +else: + # Use CUTLASS kernels for CC 80 by default (e.g., for cases in which SM86 is used) + cc = 80 + instruction_shape = [16, 8, 16] + +math_inst = MathInstruction( + instruction_shape, + A.element, B.element, element_acc, + cutlass_bindings.OpClass.TensorOp, + MathOperation.multiply_add +) + +tile_description = TileDescription( + [128, 128, 32], # Threadblock shape + 2, # Number of stages + [2, 2, 1], # Number of warps within each dimension of the threadblock shape + math_inst +) + +epilogue_functor = pycutlass.LinearCombination(C.element, C.alignment, element_acc, element_epilogue) + +operation = GemmOperationUniversal( + arch=cc, tile_description=tile_description, + A=A, B=B, C=C, + epilogue_functor=epilogue_functor) + +if args.print_cuda: + print(operation.rt_module.emit()) + +operations = [operation, ] + +# Compile the operation +pycutlass.compiler.add_module(operations) + +# Randomly initialize tensors +tensor_A = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(args.m * args.k,))).astype(np.float16) +tensor_B = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(args.k * args.n,))).astype(np.float16) +tensor_C = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(args.m * args.n,))).astype(np.float32) +tensor_D = np.zeros(shape=(args.m * args.n,)).astype(np.float32) + +problem_size = 
cutlass_bindings.gemm.GemmCoord(args.m, args.n, args.k) +alpha = 1. +beta = 0. + +arguments = GemmArguments( + operation=operation, problem_size=problem_size, + A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D, + output_op=operation.epilogue_type(alpha, beta)) + +# Run the operation +operation.run(arguments) +arguments.sync() + +# Run the host reference module and compare to the CUTLASS result +reference = ReferenceModule(A, B, C) +tensor_D_ref = reference.run(tensor_A, tensor_B, tensor_C, problem_size, alpha, beta) + +try: + assert np.array_equal(tensor_D, tensor_D_ref) +except: + assert np.allclose(tensor_D, tensor_D_ref, atol=1e-5) + +print("Passed.") diff --git a/examples/40_cutlass_py/gemm_grouped.py b/examples/40_cutlass_py/gemm_grouped.py new file mode 100644 index 0000000000..9ba2fa313a --- /dev/null +++ b/examples/40_cutlass_py/gemm_grouped.py @@ -0,0 +1,172 @@ +################################################################################ +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +""" +Basic example of using the CUTLASS Python interface to run a grouped GEMM +""" + +import sys +print("This example is deprecated. Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + +import argparse +import numpy as np + +import cutlass_bindings +import cutlass.backend as pycutlass +from cutlass.backend import * +from cutlass.backend.utils.device import device_cc + + +parser = argparse.ArgumentParser(description="Launch a grouped GEMM kernel from Python") +parser.add_argument('--print_cuda', action="store_true", help="Print the underlying CUDA kernel") + +try: + args = parser.parse_args() +except: + sys.exit(0) + +# Check that the device is of a sufficient compute capability +cc = device_cc() +assert cc >= 70, "The CUTLASS Python grouped GEMM example requires compute capability greater than or equal to 70." 
+ +np.random.seed(0) + +# Allocate a pool of device memory to be used by the kernel +pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32) + +# Set the compiler to use to NVCC +pycutlass.compiler.nvcc() + +# Set up A, B, C and accumulator +alignment = 1 +A = TensorDescription(cutlass_bindings.float16, cutlass_bindings.ColumnMajor, alignment) +B = TensorDescription(cutlass_bindings.float16, cutlass_bindings.RowMajor, alignment) +C = TensorDescription(cutlass_bindings.float32, cutlass_bindings.ColumnMajor, alignment) +element_acc = cutlass_bindings.float32 +element_epilogue = cutlass_bindings.float32 + +# Select instruction shape based on the Tensor Core instructions supported +# by the device on which we are running +if cc == 70: + instruction_shape = [8, 8, 4] +elif cc == 75: + instruction_shape = [16, 8, 8] +else: + # Use CUTLASS kernels for CC 80 by default (e.g., for cases in which SM86 is used) + cc = 80 + instruction_shape = [16, 8, 16] + +math_inst = MathInstruction( + instruction_shape, + A.element, B.element, element_acc, + cutlass_bindings.OpClass.TensorOp, + MathOperation.multiply_add +) + +tile_description = TileDescription( + [128, 128, 32], # Threadblock shape + 2, # Number of stages + [2, 2, 1], # Number of warps within each dimension of the threadblock shape + math_inst +) + +epilogue_functor = pycutlass.LinearCombination(C.element, C.alignment, element_acc, element_epilogue) + +operation = GemmOperationGrouped( + arch=cc, tile_description=tile_description, + A=A, B=B, C=C, + epilogue_functor=epilogue_functor, + precompute_mode=SchedulerMode.Device) + +if args.print_cuda: + print(operation.rt_module.emit()) + +operations = [operation, ] + +# Compile the operation +pycutlass.compiler.add_module(operations) + +# Initialize tensors for each problem in the group +problem_sizes = [ + cutlass_bindings.gemm.GemmCoord(128, 128, 64), + cutlass_bindings.gemm.GemmCoord(512, 256, 128) +] +problem_count = len(problem_sizes) + +alpha = 1. +beta = 0. 
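+
+# The LinearCombination epilogue computes D = alpha * (A @ B) + beta * C for
+# each problem in the group; with alpha = 1 and beta = 0 the source tensor C
+# is effectively ignored and D reduces to the plain matrix product:
+#   D_i = 1.0 * (A_i @ B_i) + 0.0 * C_i == A_i @ B_i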
+ +tensor_As = [] +tensor_Bs = [] +tensor_Cs = [] +tensor_Ds = [] +tensor_D_refs = [] + +reference = ReferenceModule(A, B, C) + +for problem_size in problem_sizes: + # Randomly initialize tensors + m = problem_size.m() + n = problem_size.n() + k = problem_size.k() + tensor_A = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(m * k,))).astype(np.float16) + tensor_B = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(k * n,))).astype(np.float16) + tensor_C = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(m * n,))).astype(np.float32) + tensor_D = np.zeros(shape=(m * n,)).astype(np.float32) + + tensor_As.append(tensor_A) + tensor_Bs.append(tensor_B) + tensor_Cs.append(tensor_C) + tensor_Ds.append(tensor_D) + + # Run the reference GEMM + tensor_D_ref = reference.run(tensor_A, tensor_B, tensor_C, problem_size, alpha, beta) + tensor_D_refs.append(tensor_D_ref) + +arguments = GemmGroupedArguments( + operation, problem_sizes, tensor_As, tensor_Bs, tensor_Cs, tensor_Ds, + output_op=operation.epilogue_type(alpha, beta) +) + +# Run the operation +operation.run(arguments) +arguments.sync() + +# Compare the CUTLASS result to the host reference result +for tensor_d, tensor_d_ref in zip(tensor_Ds, tensor_D_refs): + try: + assert np.array_equal(tensor_d, tensor_d_ref) + except: + assert np.allclose(tensor_d, tensor_d_ref, rtol=1e-5) + +print("Passed.") diff --git a/examples/40_cutlass_py/test-cutlass-py.py b/examples/40_cutlass_py/test-cutlass-py.py deleted file mode 100644 index e1ee636b4d..0000000000 --- a/examples/40_cutlass_py/test-cutlass-py.py +++ /dev/null @@ -1,169 +0,0 @@ - -# System modules -import numpy as np -import os.path -import sys -import ctypes - -# CUDA Python modules -from cuda import cuda -from cuda import nvrtc - -# CUTLASS modules -import library -import manifest as cutlass_manifest -import generator -import rt - - -# -# Construct an SGEMM -# - -manifest = cutlass_manifest.Manifest() - -generator.GenerateSM50_Simt(manifest, "11.5.0") - -# -# Construct a GEMM operation -# - -operation = manifest.operations_by_name['cutlass_simt_sgemm_128x128_8x2_nt_align1'] - -# -# Construct a runtime GEMM operation -# -gemm = rt.Gemm(operation) - -# -# Initialize context -# -err, = cuda.cuInit(0) - -if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError("CUDA Error %s" % str(err)) - -err, device = cuda.cuDeviceGet(0) - -if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError("CUDA Error %s" % str(err)) - -err, context = cuda.cuCtxCreate(0, device) - -if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError("CUDA Error %s" % str(err)) - -# -# Construct a module -# - -architectures = [80,] -include_paths = [ - '../../include', - '../../tools/util/include', -] - -compilation_options = rt.CompilationOptions(architectures, include_paths) - -module = rt.Module('module.cu', [gemm], compilation_options) - -# -# Setup a workspace -# - -M, N, K = (128, 128, 128) - -tensor_A = np.ndarray(M * K, dtype=np.float32) -tensor_B = np.ndarray(N * K, dtype=np.float32) -tensor_C = np.ndarray(M * N, dtype=np.float32) -tensor_D = np.ndarray(M * N, dtype=np.float32) - -err, tensor_A_d = cuda.cuMemAlloc(tensor_A.size * tensor_A.itemsize) -if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError("CUDA Error %s" % str(err)) - -err, tensor_B_d = cuda.cuMemAlloc(tensor_B.size * tensor_B.itemsize) -if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError("CUDA Error %s" % str(err)) - -err, tensor_C_d = cuda.cuMemAlloc(tensor_C.size * tensor_C.itemsize) -if err != cuda.CUresult.CUDA_SUCCESS: - raise 
RuntimeError("CUDA Error %s" % str(err)) - -err, tensor_D_d = cuda.cuMemAlloc(tensor_D.size * tensor_D.itemsize) -if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError("CUDA Error %s" % str(err)) - -err, stream = cuda.cuStreamCreate(0) -if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError("CUDA Error %s" % str(err)) - -tensors = [ - (tensor_A_d, tensor_A), - (tensor_B_d, tensor_B), - (tensor_C_d, tensor_C), - (tensor_D_d, tensor_D) -] - -for tensor_device, tensor_host in tensors: - bytes = tensor_host.size * tensor_host.itemsize - print("Tensor has dimensions: %s (%d bytes)" % (str(tensor_host.size), tensor_host.itemsize)) - err, = cuda.cuMemcpyHtoDAsync(tensor_device, tensor_host, bytes, stream) - print("updating tensor in device memory ", hex(int(tensor_device))) - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError('CUDA Error %s' % str(err)) - -# -# Initialize a host buffer -# - -arguments = rt.GemmArguments() - -arguments.problem_size = rt.GemmCoord(M, N, K) - -arguments.A = rt.TensorRef(tensor_A_d, M) -arguments.B = rt.TensorRef(tensor_B_d, N) -arguments.C = rt.TensorRef(tensor_C_d, M) -arguments.D = rt.TensorRef(tensor_D_d, M) - -host_workspace = bytearray(gemm.get_host_workspace_size(arguments)) -device_workspace = None - -launch_config = gemm.plan(arguments) - -byte_count = gemm.initialize(host_workspace, device_workspace, launch_config, arguments) - -# -# Launch the kernel -# - -err = gemm.run(host_workspace, device_workspace, launch_config) - -if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError('CUDA Error %s' % str(err)) - -# -# Verify results -# -err, = cuda.cuStreamSynchronize(stream) - -if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError("CUDA Error %s" % str(err)) - - -# -# Debug reporting of byte array contents -# - -def PrintBytearray(host_workspace): - uint_str = None - prefix = None - print("uint32_t host_workspace[] = {") - for idx, byte in enumerate(host_workspace): - if not (idx % 4): - if uint_str is not None: - print(prefix, uint_str, ",") - prefix = "/* offset: %d B */ 0x" % idx - uint_str = "" - uint_str = "{:02x}".format(byte) + uint_str - print("};") diff --git a/examples/41_fused_multi_head_attention/CMakeLists.txt b/examples/41_fused_multi_head_attention/CMakeLists.txt new file mode 100644 index 0000000000..8ed6227010 --- /dev/null +++ b/examples/41_fused_multi_head_attention/CMakeLists.txt @@ -0,0 +1,56 @@ + +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + 41_fused_multi_head_attention_fixed_seqlen + fused_multihead_attention_fixed_seqlen.cu + ) + +cutlass_example_add_executable( + 41_fused_multi_head_attention_variable_seqlen + fused_multihead_attention_variable_seqlen.cu + ) + +cutlass_example_add_executable( + 41_fused_multi_head_attention_backward + fused_multi_head_attention_backward.cu + DISABLE_TESTS ON + ) + + +add_custom_target(41_fused_multi_head_attention +DEPENDS 41_fused_multi_head_attention_fixed_seqlen + 41_fused_multi_head_attention_variable_seqlen + 41_fused_multi_head_attention_backward +) + +add_test( + NAME ctest_examples_41_fmha_backward_python + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/fmha_backward_test.py $ +) diff --git a/examples/41_fused_multi_head_attention/debug_utils.h b/examples/41_fused_multi_head_attention/debug_utils.h new file mode 100644 index 0000000000..efca4f132d --- /dev/null +++ b/examples/41_fused_multi_head_attention/debug_utils.h @@ -0,0 +1,234 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// +// Debugging functions +//////////////////////////////////////////////////////////////////////////////// +// Nans & inf detection +#define NANCHECK(frag) \ + { \ + for (size_t _i = 0; _i < frag.size(); ++_i) { \ + assert(std::isfinite(float(frag[_i]))); \ + assert(!std::isnan(float(frag[_i]))); \ + } \ + } + +// Print on the first thread of the first block +#if 1 +#define PRINT_WARP_ID 0 +#define PRINT_LANE_ID 0 +#define PRINT_B0_T0(msg, ...) \ + if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && \ + threadIdx.x == PRINT_LANE_ID && threadIdx.y == PRINT_WARP_ID && \ + threadIdx.z == 0) { \ + printf(msg "\n", ##__VA_ARGS__); \ + } +#define PRINT_T0(msg, ...) \ + if (threadIdx.x == PRINT_LANE_ID && threadIdx.y == PRINT_WARP_ID && \ + threadIdx.z == 0) { \ + printf(msg "\n", ##__VA_ARGS__); \ + } +#define PRINT_TX_LX(msg, ...) \ + for (int bx = 0; bx < gridDim.x; ++bx) { \ + for (int by = 0; by < gridDim.y; ++by) { \ + for (int bz = 0; bz < gridDim.z; ++bz) { \ + for (int tx = 0; tx < blockDim.x; ++tx) { \ + for (int ty = 0; ty < blockDim.y; ++ty) { \ + for (int tz = 0; tz < blockDim.z; ++tz) { \ + __syncthreads(); \ + if (blockIdx.x == bx && blockIdx.y == by && blockIdx.z == bz && \ + threadIdx.x == tx && threadIdx.y == ty && \ + threadIdx.z == tz) { \ + printf( \ + "[%d,%d,%d][%d,%d,%d]" msg "\n", \ + bx, \ + by, \ + bz, \ + tx, \ + ty, \ + tz, \ + ##__VA_ARGS__); \ + } \ + } \ + } \ + } \ + } \ + } \ + } +#else +#define PRINT_B0_T0 +#define PRINT_TX_LX +#endif + +struct __string_view { + char const* data; + std::size_t size; +}; +#if __cplusplus >= 201402L +template +constexpr __string_view __get_type_name() { + char const* p = __PRETTY_FUNCTION__; + while (*p++ != '=') + ; + for (; *p == ' '; ++p) + ; + char const* p2 = p; + int count = 1; + for (;; ++p2) { + switch (*p2) { + case '[': + ++count; + break; + case ']': + --count; + if (!count) + return {p, std::size_t(p2 - p)}; + } + } + return {}; +} +#else +template +constexpr __string_view __get_type_name() { + return {"unsupported", 11}; +} +#endif + +// Print a given array +#define PRINT_ACCUM8_T0_L0_START(name, accum, start) \ + PRINT_B0_T0( \ + "%s[%d:%d] - {%f, %f, %f, %f, %f, %f, %f, %f}", \ + name, \ + int(start), \ + int(start + 8), \ + float(accum[start + 0]), \ + float(accum[start + 1]), \ + float(accum[start + 2]), \ + float(accum[start + 3]), \ + float(accum[start + 4]), \ + float(accum[start + 5]), \ + float(accum[start + 6]), \ + float(accum[start + 7])); +#define PRINT_ACCUM8_T0_L0(name, accum) PRINT_ACCUM8_T0_L0_START(name, accum, 0) +#define PRINT_FRAG_T0_L0(name, frag) \ + { \ + auto typeStr = __get_type_name(); \ + PRINT_B0_T0("printing %s (%s)", name, typeStr.data); \ + for (size_t _start = 0; _start < frag.size(); _start += 8) { \ + PRINT_ACCUM8_T0_L0_START(" ", frag, _start); \ + } \ + /*__syncthreads(); \ + NANCHECK(frag); */ \ + } +#define PRINT_ARRAY_T0_L0_INCR(name, array, length, incr) \ + { \ + PRINT_B0_T0("printing %s (len=%d)", name, int(length)); \ + for (int _start = 0; _start < length; _start += incr) { \ + PRINT_ACCUM8_T0_L0_START(" ", array, _start); \ + } \ + } +#define PRINT_ARRAY_T0_L0(name, array, length) \ + PRINT_ARRAY_T0_L0_INCR(name, array, length, 8) + +// Print a 4x4 matrix +#define PRINT_TENSOR4x4_T0_L0_START(name, ref, start_x, start_y) \ + 
PRINT_B0_T0( \ + "%s[%d:%d, %d:%d]:\n %f, %f, %f, %f\n %f, %f, %f, %f\n %f, %f, %f, %f\n %f, %f, %f, %f", \ + name, \ + int(start_x), \ + int(start_x + 4), \ + int(start_y), \ + int(start_y + 4), \ + float(ref.at({start_x + 0, start_y + 0})), \ + float(ref.at({start_x + 0, start_y + 1})), \ + float(ref.at({start_x + 0, start_y + 2})), \ + float(ref.at({start_x + 0, start_y + 3})), \ + float(ref.at({start_x + 1, start_y + 0})), \ + float(ref.at({start_x + 1, start_y + 1})), \ + float(ref.at({start_x + 1, start_y + 2})), \ + float(ref.at({start_x + 1, start_y + 3})), \ + float(ref.at({start_x + 2, start_y + 0})), \ + float(ref.at({start_x + 2, start_y + 1})), \ + float(ref.at({start_x + 2, start_y + 2})), \ + float(ref.at({start_x + 2, start_y + 3})), \ + float(ref.at({start_x + 3, start_y + 0})), \ + float(ref.at({start_x + 3, start_y + 1})), \ + float(ref.at({start_x + 3, start_y + 2})), \ + float(ref.at({start_x + 3, start_y + 3}))); +#define PRINT_TENSOR4x4_T0_L0(name, ref) \ + PRINT_TENSOR4x4_T0_L0_START(name, ref, 0, 0) + +#define PRINT_PROBLEM_SIZE(name, ps) \ + PRINT_B0_T0( \ + "%s.problem_size: {.m=%d, .n=%d, .k=%d}", \ + name, \ + int(ps.m()), \ + int(ps.n()), \ + int(ps.k())) + +template +CUTLASS_DEVICE void print_warp_accum( + AccumT accum, + LaneOffsetT lane_offset, + int32_t num_rows, + int32_t num_cols) { + bool is_main = blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && + threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0; + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + if (col % 32 == 0) { + if (is_main) { + printf("\nmat[%3d, %3d:%3d]", row, col, col + 32); + } + __syncthreads(); + } + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (row == accum_m && col == accum_n && + (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0)) { + printf(" %6.1f", float(accum[idx])); + } + }, + [&](int accum_m) {}); + __syncthreads(); + } + if (is_main) { + printf("\n"); + } + } +} diff --git a/examples/41_fused_multi_head_attention/default_fmha_grouped.h b/examples/41_fused_multi_head_attention/default_fmha_grouped.h new file mode 100644 index 0000000000..54e537c9ca --- /dev/null +++ b/examples/41_fused_multi_head_attention/default_fmha_grouped.h @@ -0,0 +1,299 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with + the appropriate threadblock-scoped epilogue. + + Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are + accommodated by exchanging A and B operands and assuming transposed layouts. Partial + specializations here choose 'device::GemmTransposed' to implement this functionality. + +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/complex.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +#include "fmha_grouped.h" +#include "gemm_kernel_utils.h" +#include "gemm/custom_mma.h" +#include "gemm/find_default_mma.h" +#include "gemm/mma_from_smem.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The datatype of Q/K/V + typename scalar_t_, + // Architecture we are targeting (eg `cutlass::arch::Sm80`) + typename ArchTag_, + // If Q/K/V are correctly aligned in memory and we can run a fast kernel + bool isAligned_, + int kQueriesPerBlock, + int kKeysPerBlock, + int kMaxK = (int)cutlass::platform::numeric_limits::max(), + GroupScheduleMode GroupScheduleMode_ = GroupScheduleMode::kDeviceOnly + > +struct DefaultFMHAGrouped { + using scalar_t = scalar_t_; + using accum_t = float; + using output_t = scalar_t; + + // Accumulator between 2 iterations + // Using `accum_t` improves perf on f16 at the cost of + // numerical errors + using output_accum_t = accum_t; + + using ArchTag = ArchTag_; + static bool const kIsAligned = isAligned_; + static bool const kSingleValueIteration = kMaxK <= kKeysPerBlock; + static constexpr bool kIsHalf = cutlass::sizeof_bits::value == 16; + static int const kWarpSize = 32; + static int const kNumWarpsPerBlock = kQueriesPerBlock * kKeysPerBlock / (kWarpSize * kWarpSize); + + struct MM0 { + /* + In this first matmul, we compute a block of `Q @ K.T`. 
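+    (Informally, for a query tile Q_i and a key tile K_j this yields the score
+    block S = Q_i @ K_j^T; `mi`, `m_prime` and `s_prime` track the running
+    row-wise maximum and the running row-wise sum of exp(S - max), so the
+    softmax over all key blocks can be accumulated in a single, numerically
+    stable streaming pass.)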
+ While the calculation result is still hot in registers, we update + `mi`, `m_prime`, `s_prime` in shared-memory, and then store this value + into a shared-memory ("AccumulatorSharedStorage") that is used later as + operand A for the second matmul (see MM1) + */ + + using GemmType = gemm_kernel_utils::DefaultGemmType; + using OpClass = typename GemmType::OpClass; + + using ElementA = scalar_t; + using ElementB = scalar_t; + using ElementC = scalar_t; + using ElementAccumulator = accum_t; + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::RowMajor; + + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration< + OpClass, + ArchTag, + ElementA, + ElementB, + ElementC, + ElementAccumulator + >; + + static int const kAlignmentA = + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment; + static int const kAlignmentB = + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; + + using ThreadblockShape = cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + static int const kStages = DefaultConfig::kStages; + using Operator = typename GemmType::Operator; + + using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementAccumulator, + LayoutC, + OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + ArchTag::kMinComputeCapability >= 80 && kIsHalf + ? 4 + : DefaultConfig::kStages, + Operator + >::DefaultMma; + + using MmaCore = typename DefaultMma::MmaCore; + using IteratorA = typename DefaultMma::IteratorA; + using IteratorB = typename DefaultMma::IteratorB; + using DefaultThreadblockMma = typename DefaultMma::ThreadblockMma; + using Mma = typename cutlass::platform::conditional< + kSingleValueIteration, + typename MakeCustomMma::Mma, + DefaultThreadblockMma>::type; + using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator< + typename Mma::Operator::IteratorC, + ElementAccumulator, + kWarpSize>::Iterator; + + static_assert(MmaCore::WarpCount::kCount == kNumWarpsPerBlock, ""); + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm< + typename Mma::Operator::IteratorC, + typename Mma::Operator, + scalar_t, + WarpShape, + ThreadblockShape>; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MM1 { + /* + Second matmul: perform `attn @ V` where `attn` is the attention (not + normalized) and stored in shared memory + */ + + using GemmType = typename MM0::GemmType; + using OpClass = typename GemmType::OpClass; + + using ElementA = scalar_t; + using ElementB = scalar_t; + using ElementC = output_accum_t; + using ElementAccumulator = accum_t; + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration< + OpClass, + ArchTag, + ElementA, + ElementB, + ElementC, + ElementAccumulator + >; + + static int const kAlignmentA = DefaultConfig::kAlignmentA; + static int const kAlignmentB = + kIsAligned ? 
DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; + + using ThreadblockShape = typename MM0::ThreadblockShape; + using WarpShape = typename MM0::WarpShape; + using InstructionShape = typename MM0::InstructionShape; + + using EpilogueOutputOp = typename DefaultConfig::EpilogueOutputOp; + + static int const kStages = DefaultConfig::kStages; + using Operator = typename GemmType::Operator; + + using ThreadblockSwizzle = void; // Swizzling is unused + static bool const kSplitKSerial = false; + + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementC, + LayoutC, + ElementAccumulator, + OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + ArchTag::kMinComputeCapability >= 80 && kIsHalf + ? 4 + : DefaultConfig::kStages, + kSplitKSerial, + Operator>; + + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Policy::Operator::Shape, // WarpShape + typename DefaultGemm::Mma::Policy::Operator::InstructionShape, + typename DefaultGemm::Mma::Policy::Operator::IteratorA, + typename DefaultGemm::Mma::Policy>::WarpIterator; + + using DefaultMmaFromSmem = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MM0::AccumulatorSharedStorage::Shape::kN, // kMaxK + WarpIteratorA, + false>; // kScaleOperandA + + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + static_assert(WarpCount::kCount == kNumWarpsPerBlock, ""); + + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::PredicatedTileIterator< + typename DefaultEpilogue::OutputTileIterator::ThreadMap, + output_t>; + using OutputTileIteratorAccum = + typename cutlass::epilogue::threadblock::PredicatedTileIterator< + typename DefaultEpilogue::OutputTileIterator::ThreadMap, + output_accum_t>; + }; + +/// Define the kernel in terms of the default kernel + using FMHAKernel = kernel::FMHAGrouped< + MM0, + MM1, + scalar_t, + accum_t, + output_t, + output_accum_t, + kSingleValueIteration, + GroupScheduleMode_ + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/epilogue/epilogue_pipelined.h b/examples/41_fused_multi_head_attention/epilogue/epilogue_pipelined.h new file mode 100644 index 0000000000..e166af4de4 --- /dev/null +++ b/examples/41_fused_multi_head_attention/epilogue/epilogue_pipelined.h @@ -0,0 +1,624 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + + File copied from "cutlass/epilogue/threadblock/epilogue.h" + then modified to: + (1) load 2 source fragments at the same time (pipelining) + (2) support reading from a different dtype + (3) pass the row id to the OutputOp if it takes it + (see MemoryEfficientAttentionNormalize) + Note that in general the fragment passed to the OutputOp could + span multiple rows but it does not happen with the configurations we have +*/ + +#pragma once + +#include + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/functional.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/vector.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_coord.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_iterator.h" +#include "cutlass/epilogue/threadblock/epilogue_base.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +template +struct ApplyEpilogueOp { + static CUTLASS_DEVICE typename Op::FragmentOutput apply( + Op const& output_op, + int row_id, + typename Op::FragmentAccumulator const& accum, + typename Op::FragmentOutput const& source) { + return output_op(accum, source); + } + static CUTLASS_DEVICE typename Op::FragmentOutput apply( + Op const& output_op, + int row_id, + typename Op::FragmentAccumulator const& accum) { + return output_op(accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Epilogue operator +template < + typename Shape_, ///< Shape of threadblock tile (concept: GemmShape) + typename WarpMmaOperator_, ///< Warp-level MMA operator (concept: + ///< gemm::warp::MmaTensorOp) + int PartitionsK, ///< Number of partitions of the K dimension + typename OutputTileIterator_, ///< Tile iterator writing output tensors + typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting + 
///< accumulators + typename WarpTileIterator_, ///< Warp-scoped tile iterator writing + ///< accumulators to SMEM + typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading + ///< from SMEM + typename OutputOp_, ///< Output operator + typename Padding_, ///< Padding added to SMEM allocation to avoid bank + ///< conflicts (concept: MatrixShape) + int FragmentsPerPartition = + 1, ///< Used to coarsten the epilogue granularity + int IterationsUnroll = ///< Used to reduce binary size when epilogue op is + ///< large + (!IsEpilogueFunctorHeavy::value), + typename OutputTileSourceIterator_ = + OutputTileIterator_ ///< Tile iterator reading tensors + > +class EpiloguePipelined : public EpilogueBase< + Shape_, + typename WarpMmaOperator_::Shape, + PartitionsK, + AccumulatorFragmentIterator_, + WarpTileIterator_, + Padding_, + FragmentsPerPartition> { + public: + using Base = EpilogueBase< + Shape_, + typename WarpMmaOperator_::Shape, + PartitionsK, + AccumulatorFragmentIterator_, + WarpTileIterator_, + Padding_, + FragmentsPerPartition>; + + using Shape = Shape_; + using WarpMmaOperator = WarpMmaOperator_; + static int const kPartitionsK = PartitionsK; + using OutputTileIterator = OutputTileIterator_; + using OutputTileSourceIterator = OutputTileSourceIterator_; + using AccumulatorFragmentIterator = AccumulatorFragmentIterator_; + using WarpTileIterator = WarpTileIterator_; + using SharedLoadIterator = SharedLoadIterator_; + using OutputOp = OutputOp_; + using Padding = Padding_; + + using Layout = layout::RowMajor; + using LongIndex = typename Layout::LongIndex; + + /// The complete warp-level accumulator tile + using AccumulatorTile = typename Base::AccumulatorTile; + + /// Accumulator element + using ElementAccumulator = typename WarpTileIterator::Element; + + /// Output element + using ElementOutput = typename OutputTileIterator::Element; + using ElementSource = typename OutputTileSourceIterator::Element; + + /// Output access size + static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess; + + /// Tensor reference to destination tensor + using TensorRef = typename OutputTileIterator::TensorRef; + + /// Tensor reference to sync tensor + using SyncTensorRef = + typename cutlass::TensorRef; + + /// Const tensor reference to source tensor + using ConstTensorRef = typename OutputTileIterator::ConstTensorRef; + + /// Array type used to output + using OutputAccessType = Array< + typename OutputTileIterator::Element, + OutputTileIterator::kElementsPerAccess>; + using SourceAccessType = Array< + typename OutputTileSourceIterator::Element, + OutputTileSourceIterator::kElementsPerAccess>; + + /// Array type used by output functor + using AccumulatorAccessType = Array< + typename WarpTileIterator::Element, + OutputTileIterator::kElementsPerAccess>; + + /// Number of warps + using WarpCount = typename Base::WarpCount; + + static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 + ? 
Base::kFragmentsPerIteration + : kPartitionsK; + static int constexpr kSmemPointerOffset = + Base::SharedStorage::StorageShape::kCount / kSmemTiles; + + public: + static_assert( + OutputTileSourceIterator::Fragment::kElements == + OutputTileIterator::Fragment::kElements, + "Mismatch between input tile and output tile iterator (kElements)"); + static_assert( + OutputTileSourceIterator::kIterations == OutputTileIterator::kIterations, + "Mismatch between input tile and output tile iterator (kIterations)"); + static_assert( + SharedLoadIterator::Fragment::kElements == + OutputTileIterator::Fragment::kElements, + "Mismatch between shared load iterator and output tile iterator."); + + static_assert( + OutputTileIterator::kElementsPerAccess, + "OutputTileIterator::kElementsPerAccess must not be zero."); + + static_assert( + !(OutputTileIterator::Fragment::kElements % + OutputTileIterator::kElementsPerAccess), + "Divisibility"); + + private: + /// Loads fragment from shared memory aligned with output tensor + SharedLoadIterator shared_load_iterator_; + + public: + /// Constructor + CUTLASS_DEVICE + EpiloguePipelined( + typename Base::SharedStorage& shared_storage, ///< Shared storage object + int thread_idx, ///< ID of a thread within the threadblock + int warp_idx, ///< ID of warp within threadblock + int lane_idx ///< Id of thread within warp + ) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + shared_load_iterator_(shared_storage.reference(), thread_idx) {} + + /// Streams the result to global memory + CUTLASS_DEVICE + void operator()( + OutputOp const& output_op, ///< Output operator + OutputTileIterator + destination_iterator, ///< Tile iterator for destination + AccumulatorTile const& + accumulators, ///< Complete warp-level accumulator tile + OutputTileSourceIterator + source_iterator) { ///< Threadblock tile coordinate in GEMM (in units + ///< of threadblock tiles) + + if (!output_op.is_source_needed()) { + compute_source_not_needed_(output_op, destination_iterator, accumulators); + } else { + compute_source_needed_( + output_op, destination_iterator, accumulators, source_iterator); + } + } + CUTLASS_DEVICE + void operator()( + OutputOp const& output_op, ///< Output operator + OutputTileIterator + destination_iterator, ///< Tile iterator for destination + AccumulatorTile const& + accumulators) { ///< Complete warp-level accumulator tile + compute_source_not_needed_(output_op, destination_iterator, accumulators); + } + + private: + template + struct acc2smem_source_not_needed; + + template + struct acc2smem_source_not_needed> { + template + CUTLASS_DEVICE static void helper( + AccumulatorFragmentIterator accum_fragment_iterator, + WarpTileIterator& warp_tile_iterator) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Advance; i++) { + ++accum_fragment_iterator; + } + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < Base::kFragmentsPerIteration; ++p) { + typename AccumulatorFragmentIterator::Fragment accum_fragment; + + accum_fragment_iterator.load(accum_fragment); + ++accum_fragment_iterator; + + warp_tile_iterator.store(accum_fragment); + if (p < Base::kFragmentsPerIteration - 1) { + warp_tile_iterator.add_pointer_offset(kSmemPointerOffset); + } + } + + if (Base::kFragmentsPerIteration > 1) { + warp_tile_iterator.add_pointer_offset( + kSmemPointerOffset * (1 - Base::kFragmentsPerIteration)); + } + } + + CUTLASS_DEVICE + static void push( + size_t pos, + AccumulatorFragmentIterator const& iterator_begin, + WarpTileIterator& warp_tile_iterator) { + int dummy[] = { + (pos == (Seq * 
Base::kFragmentsPerIteration)) && + (helper( + iterator_begin, warp_tile_iterator), + 0)...}; + + CUTLASS_UNUSED(dummy[0]); + } + }; + + static_assert( + kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, + "One of these must be exactly 1."); + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_not_needed_( + OutputOp const& output_op, ///< Output operator + OutputTileIterator + destination_iterator, ///< Tile iterator for destination + AccumulatorTile const& + accumulators ///< Complete warp-level accumulator tile + ) { + // + // Iterator over warp-level accumulator fragment + // + + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); + + // + // Iterate over accumulator tile + // + +#pragma unroll( \ + IterationsUnroll \ + ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration \ + : 1) + for (int iter = 0; iter < OutputTileIterator::kIterations; + iter += Base::kFragmentsPerIteration) { + // + // Convert and store fragment + // + + __syncthreads(); + + acc2smem_source_not_needed>:: + push(iter, accum_fragment_iterator, this->warp_tile_iterator_); + + __syncthreads(); + + // + // Load fragments from shared memory + // + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < Base::kFragmentsPerIteration; ++p) { + typename SharedLoadIterator::Fragment + aligned_accum_fragment[kPartitionsK]; + + shared_load_iterator_.load(aligned_accum_fragment[0]); + + if (p < Base::kFragmentsPerIteration - 1) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); + } else if (kPartitionsK > 1) { + plus add_fragments; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kPartitionsK; ++i) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); + shared_load_iterator_.load(aligned_accum_fragment[i]); + aligned_accum_fragment[0] = add_fragments( + aligned_accum_fragment[0], aligned_accum_fragment[i]); + } + + shared_load_iterator_.add_pointer_offset( + (1 - kPartitionsK) * kSmemPointerOffset); + } + + // + // Compute the output result + // + + typename OutputTileIterator::Fragment output_fragment; + + apply_output_operator_source_not_needed_( + destination_iterator.thread_start_row(), + output_fragment, + output_op, + aligned_accum_fragment[0]); + + // + // Store the final result + // + + destination_iterator.store(output_fragment); + ++destination_iterator; + } + + if (Base::kFragmentsPerIteration > 1) { + shared_load_iterator_.add_pointer_offset( + kSmemPointerOffset * (1 - Base::kFragmentsPerIteration)); + } + } + } + + template + struct acc2smem_source_needed; + + template + struct acc2smem_source_needed> { + template + CUTLASS_DEVICE static void helper( + AccumulatorFragmentIterator accum_fragment_iterator, + WarpTileIterator& warp_tile_iterator) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Advance; i++) { + ++accum_fragment_iterator; + } + + typename AccumulatorFragmentIterator::Fragment accum_fragment; + accum_fragment_iterator.load(accum_fragment); + warp_tile_iterator.store(accum_fragment); + } + + CUTLASS_DEVICE + static void push( + size_t pos, + AccumulatorFragmentIterator const& iterator_begin, + WarpTileIterator& warp_tile_iterator) { + int dummy[] = { + (pos == Seq) && + (helper(iterator_begin, warp_tile_iterator), 0)...}; + } + }; + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_needed_( + OutputOp const& output_op, ///< Output operator + OutputTileIterator + destination_iterator, ///< Tile iterator for destination + AccumulatorTile const& + accumulators, ///< Complete warp-level accumulator 
tile + OutputTileSourceIterator + source_iterator ///< Threadblock tile coordinate in GEMM (in units of + ///< threadblock tiles) + ) { + typename OutputTileSourceIterator::Fragment source_fragment[2]; + + source_fragment[0].clear(); + source_iterator.load(source_fragment[0]); + ++source_iterator; + source_fragment[1].clear(); + + // + // Iterator over warp-level accumulator fragment + // + + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); + + // + // Iterate over accumulator tile + // + +#pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1) + for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { + if (iter > 0) { + __syncthreads(); + } + // + // Load the source for next iteration (pipelining) + // + + if (iter + 1 < OutputTileIterator::kIterations) { + source_iterator.load(source_fragment[(iter + 1) % 2]); + } + ++source_iterator; + acc2smem_source_needed< + cutlass::make_index_sequence>:: + push(iter, accum_fragment_iterator, this->warp_tile_iterator_); + + __syncthreads(); + + // + // Load fragments from shared memory + // + + typename SharedLoadIterator::Fragment + aligned_accum_fragment[kPartitionsK]; + + shared_load_iterator_.load(aligned_accum_fragment[0]); + + // If the number of k-slices is > 1 - perform a reduction amongst the + // k-slices + if (kPartitionsK > 1) { + plus add_fragments; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kPartitionsK; ++i) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); + shared_load_iterator_.load(aligned_accum_fragment[i]); + aligned_accum_fragment[0] = add_fragments( + aligned_accum_fragment[0], aligned_accum_fragment[i]); + } + + shared_load_iterator_.add_pointer_offset( + (1 - kPartitionsK) * kSmemPointerOffset); + } + + // + // Compute the output result + // + + typename OutputTileIterator::Fragment output_fragment; + + apply_output_operator_( + destination_iterator.thread_start_row(), + output_fragment, + output_op, + aligned_accum_fragment[0], + source_fragment[iter % 2]); + + // + // Store the final result + // + + destination_iterator.store(output_fragment); + ++destination_iterator; + } + } + + /// Helper to invoke the output functor over each vector of output + CUTLASS_DEVICE + void apply_output_operator_( + int begin_row, + typename OutputTileIterator::Fragment& output_fragment, + OutputOp const& output_op, ///< Output operator + typename SharedLoadIterator::Fragment const& aligned_accum_fragment, + typename OutputTileSourceIterator::Fragment const& source_fragment) { + OutputAccessType* output_frag_ptr = + reinterpret_cast(&output_fragment); + + AccumulatorAccessType const* compute_frag_ptr = + reinterpret_cast(&aligned_accum_fragment); + + SourceAccessType const* source_frag_ptr = + reinterpret_cast(&source_fragment); + + int const kOutputOpIterations = OutputTileIterator::Fragment::kElements / + OutputTileIterator::kElementsPerAccess; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kOutputOpIterations; ++i) { + // Call the output operator + output_frag_ptr[i] = ApplyEpilogueOp::apply( + output_op, + begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess), + compute_frag_ptr[i], + source_frag_ptr[i]); + } + } + + /// Helper to invoke the output functor over each vector of output + CUTLASS_DEVICE + void apply_output_operator_source_not_needed_( + int begin_row, + typename OutputTileIterator::Fragment& output_fragment, + OutputOp const& output_op, ///< Output operator + typename SharedLoadIterator::Fragment const& aligned_accum_fragment) { + 
OutputAccessType* output_frag_ptr = + reinterpret_cast(&output_fragment); + + AccumulatorAccessType const* compute_frag_ptr = + reinterpret_cast(&aligned_accum_fragment); + + int const kOutputOpIterations = OutputTileIterator::Fragment::kElements / + OutputTileIterator::kElementsPerAccess; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kOutputOpIterations; ++i) { + // Call the output operator + output_frag_ptr[i] = ApplyEpilogueOp::apply( + output_op, + begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess), + compute_frag_ptr[i]); + } + } + + // This should be constexpr, but it's only supported on c++14 + static int CUTLASS_HOST_DEVICE getRowOffset(int i) { + using ThreadMap = typename OutputTileIterator::ThreadMap; + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + int frag_row_idx = + (row + + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + int frag_idx = ThreadMap::kElementsPerAccess * + (frag_row_idx * ThreadMap::Iterations::kColumn + column); + if (i < frag_idx + ThreadMap::kElementsPerAccess) { + return row_offset; + } + } + } + } + } + return -1; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/epilogue/epilogue_rescale_output.h b/examples/41_fused_multi_head_attention/epilogue/epilogue_rescale_output.h new file mode 100644 index 0000000000..6860ee9e4c --- /dev/null +++ b/examples/41_fused_multi_head_attention/epilogue/epilogue_rescale_output.h @@ -0,0 +1,254 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + + The epilogue rearranges the result of a matrix product through shared memory + to match canonical tensor layouts in global memory. Epilogues support + conversion and reduction operations. + + This is a copy of cutlass/epilogue/threadblock/epilogue.h that can + handle "row_id" as a first argument, as uses it to get the corresponding + `m_prime` / `s_prime` to rescale the output. +*/ + +#pragma once + +#include + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/functional.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/vector.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_coord.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_iterator.h" +#include "cutlass/epilogue/threadblock/epilogue_base.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "epilogue_pipelined.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. +// output <- alpha * accumulator + beta * source +// with: +// alpha = 1 / s_prime (to normalize when isLast=True, 1 otherwise) +// beta = alpha / m_prime (renormalize the output when the max changes) +// source is the current output +template < + typename ElementOutput_, ///< Data type used to store tensors + typename ElementSource_, //< Data type for source (usually matches + //`ElementOutput`) + int Count, ///< Number of elements computed per operation. 
+ ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are not enough data + ///< to store + typename ElementAccumulator_, ///< Accumulator data type + typename ElementCompute_, ///< Data type used to compute linear combination + bool isFirst, + bool isLast, + typename FragmentAlphaBeta_, + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest> +class MemoryEfficientAttentionNormalize { + public: + using ElementOutput = ElementOutput_; + using ElementSource = ElementSource_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + + static int const kCount = Count; + + using FragmentOutput = Array; + using FragmentSource = Array; + using FragmentAccumulator = Array; + using ComputeFragment = Array; + using FragmentAlphaBeta = FragmentAlphaBeta_; + + static FloatRoundStyle const kRound = Round; + + private: + // + // Data members + // + + FragmentAlphaBeta const& s_prime_; + FragmentAlphaBeta const& m_prime_; + + public: + /// Constructs the function object, possibly loading from pointers in host + /// memory + CUTLASS_HOST_DEVICE + MemoryEfficientAttentionNormalize( + FragmentAlphaBeta const& s_prime, + FragmentAlphaBeta const& m_prime) + : s_prime_(s_prime), m_prime_(m_prime) {} + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + return !isFirst; + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) {} + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + int row, + FragmentAccumulator const& accumulator, + FragmentSource const& source) const { + assert(!isFirst); + + // Convert source to interal compute numeric type + NumericArrayConverter + source_converter; + NumericArrayConverter + accumulator_converter; + + // Convert to destination numeric type + NumericArrayConverter + destination_converter; + + ComputeFragment converted_source = source_converter(source); + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + ComputeFragment intermediate; + + multiplies mul_add_source; + multiply_add mul_add_accumulator; + + ElementCompute alpha = isLast ? (1 / s_prime_[row]) : 1; + ElementCompute beta = alpha * m_prime_[row]; + + intermediate = mul_add_source(beta, converted_source); // X = beta * C + + intermediate = mul_add_accumulator( + alpha, converted_accumulator, intermediate); // D = alpha * Accum + X + + return destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()(int row, FragmentAccumulator const& accumulator) + const { + assert(isFirst); + + // Convert source to interal compute numeric type + NumericArrayConverter + accumulator_converter; + + // Convert to destination numeric type + NumericArrayConverter + destination_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + ComputeFragment intermediate; + multiplies mul_accumulator; + + ElementCompute alpha = isLast ? 
(1 / s_prime_[row]) : 1; + + intermediate = mul_accumulator( + alpha, converted_accumulator); // X = alpha * C + uniform + + return destination_converter(intermediate); + } +}; + +} // namespace thread + +namespace threadblock { +template < + typename EO, + typename ES, + int Count, + typename EA, + typename EC, + bool F, + bool L, + typename FAB, + FloatRoundStyle R> +struct ApplyEpilogueOp> { + using Op = thread:: + MemoryEfficientAttentionNormalize; + static CUTLASS_DEVICE typename Op::FragmentOutput apply( + Op const& output_op, + int row_id, + typename Op::FragmentAccumulator const& accum, + typename Op::FragmentSource const& source) { + return output_op(row_id, accum, source); + } + static CUTLASS_DEVICE typename Op::FragmentOutput apply( + Op const& output_op, + int row_id, + typename Op::FragmentAccumulator const& accum) { + return output_op(row_id, accum); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/epilogue/epilogue_thread_apply_logsumexp.h b/examples/41_fused_multi_head_attention/epilogue/epilogue_thread_apply_logsumexp.h new file mode 100644 index 0000000000..bc2a28c0cf --- /dev/null +++ b/examples/41_fused_multi_head_attention/epilogue/epilogue_thread_apply_logsumexp.h @@ -0,0 +1,174 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination operations used by epilogues. 
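+
+  In this file the functor of interest, ApplyLogSumExp, computes
+  output <- exp(input - lse) elementwise; since exp(x_j - logsumexp(x)) equals
+  softmax(x)_j, this in effect recovers attention probabilities from raw
+  scores and a precomputed row-wise log-sum-exp.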
+*/
+
+#pragma once
+
+#include <cuda_fp16.h>
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <typename Element, int ElementsPerAccess>
+struct ArrayExponential {
+  CUTLASS_HOST_DEVICE
+  Array<Element, ElementsPerAccess> operator()(
+      Array<Element, ElementsPerAccess> const& input) const {
+    Array<Element, ElementsPerAccess> result;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < ElementsPerAccess; ++i) {
+      result[i] = expf(input[i]);
+    }
+
+    return result;
+  }
+};
+
+template <int ElementsPerAccess>
+struct ArrayExponential<half_t, ElementsPerAccess> {
+  CUTLASS_DEVICE
+  Array<half_t, ElementsPerAccess> operator()(
+      Array<half_t, ElementsPerAccess> const& input) const {
+    Array<half_t, ElementsPerAccess> result;
+
+    int const kVectorCount = ElementsPerAccess / 2;
+
+    __half2 const* input_ptr =
+        reinterpret_cast<__half2 const*>(input.raw_data());
+    __half2* res_ptr = reinterpret_cast<__half2*>(result.raw_data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kVectorCount; ++i) {
+      res_ptr[i] = h2exp(input_ptr[i]);
+    }
+
+    return result;
+  }
+};
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies:
+///   output <- (input - lse).exp()
+template <
+    typename ElementOutput_, // output
+    typename ElementLSE_, // accumulator from LSE
+    typename ElementAccumulator_, // accumulator from matmul
+    typename ElementCompute_, // intermediate compute (and exp calculation)
+    int ElementsPerAccess>
+class ApplyLogSumExp {
+ public:
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementLSE = ElementLSE_;
+
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kCount = kElementsPerAccess;
+  static const ScaleType::Kind kScale =
+      cutlass::epilogue::thread::ScaleType::NoBetaScaling;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+  using FragmentLSE = Array<ElementLSE, kElementsPerAccess>;
+  using FragmentScaleBias = FragmentLSE; // Used by epilogue_smem_accumulator.h
+
+ public:
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  ApplyLogSumExp() {}
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return true;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {}
+
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+      FragmentAccumulator const& AB,
+      FragmentLSE const& scale_unused,
+      // bias used as LSE
+      FragmentLSE const& bias) const {
+    FragmentCompute frag_AB = NumericArrayConverter<
+        ElementCompute,
+        ElementAccumulator,
+        kElementsPerAccess>()(AB);
+    FragmentCompute frag_lse_compute =
+        NumericArrayConverter<ElementCompute, ElementLSE, kElementsPerAccess>()(
+            bias);
+    FragmentCompute frag_compute;
+
+    minus<FragmentCompute> minus_lse;
+    detail::ArrayExponential<ElementCompute, kElementsPerAccess> apply_exp;
+    frag_compute = minus_lse(frag_AB, frag_lse_compute);
+    frag_compute = apply_exp(frag_compute);
+
+    return NumericArrayConverter<
+        ElementOutput,
+        ElementCompute,
+        kElementsPerAccess>()(frag_compute);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
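For intuition, `ApplyLogSumExp` above computes `output = exp(accum - lse)` elementwise: given the raw (already scaled) Q·Kᵀ accumulator and the row-wise log-sum-exp stored by the forward pass, it reconstructs the softmax probabilities without rerunning a full softmax. A minimal PyTorch sketch of the same identity; the tensor names are illustrative only and not part of this header's API:

```python
import torch

scores = torch.randn(4, 8)                 # raw (scaled) Q @ K^T rows
lse = torch.logsumexp(scores, dim=-1)      # saved per query row by the forward pass

probs = torch.exp(scores - lse[:, None])   # what the epilogue applies elementwise

assert torch.allclose(probs, scores.softmax(dim=-1), atol=1e-6)
```

This identity is what lets the backward kernels recompute attention probabilities block by block from Q, K and the stored LSE instead of keeping the full attention matrix around.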
+///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/fmha_backward_test.py b/examples/41_fused_multi_head_attention/fmha_backward_test.py new file mode 100644 index 0000000000..cdea9ded4d --- /dev/null +++ b/examples/41_fused_multi_head_attention/fmha_backward_test.py @@ -0,0 +1,232 @@ +################################################################################################# +# +# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################################# + +import argparse +import torch +import sys +import os +from piped_subprocess import PipedSubprocess, TORCH_DTYPE_NAME +import math + + +parser = argparse.ArgumentParser() +parser.add_argument("example_exe", type=str, help="Path to the 41_fused_multi_head_attention_backward executable") +args = parser.parse_args() + +torch.manual_seed(0) +dtype = torch.float16 +B, Mq, Mkv, H, K, Kv = 2, 1024, 1024, 5, 128, 128 +causal = True +repeat_count = 100 + +ATOL = { + torch.float: 5e-4, + torch.half: 9.5e-2, + torch.bfloat16: 7e-1, +}[dtype] + +RTOL = { + torch.float: 1e-4, + torch.half: 2e-2, + torch.bfloat16: 1e-1, +}[dtype] + + +assert not (causal and Mq < Mkv), "causal only supports seqlenK <= seqlenQ" + +fmha_bw_binary = args.example_exe +if not os.path.isfile(fmha_bw_binary): + print(f"""No such file: `{fmha_bw_binary}`\nDid you forget to run "make 41_fused_multi_head_attention"?""") + sys.exit(1) + +def create_lower_triangular_mask(): + return torch.triu(torch.full( # type: ignore + [1, Mq, Mkv], + dtype=dtype, + fill_value=float("-inf"), + ), diagonal=1) + +def ref_mha_bmk(q, k, v, mask): + # Multi-head attention with inputs/outputs in BMK format + q = q.float() + k = k.float() + v = v.float() + + q = q * (1 / q.shape[-1] ** 0.5) + attn = q @ k.transpose(-2, -1) + if mask is not None: + attn += mask + attn_max = attn.max(-1, True).values + attn_norm = (attn - attn_max).exp().sum(-1, True) + attn = attn.softmax(-1) + lse = attn_max + attn_norm.log() + lse = lse.squeeze(2) + return attn @ v, lse + + +def bmhk2bmk(t): + return t.permute((0, 2, 1, 3)).reshape( + [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]] + ) + +def ref_mha_bmhk(q, k, v, mask): + # Multi-head attention with inputs/outputs in BMHK format + assert q.ndim == 4 + + out, lse = ref_mha_bmk(bmhk2bmk(q), bmhk2bmk(k), bmhk2bmk(v), mask=mask) + out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]]) + return out.permute((0, 2, 1, 3)), lse.reshape([q.shape[0], q.shape[2], q.shape[1]]) + +def ref_mha_bw_bmhk(q, k, v, mask, lse, out, grad_out, delta): + lse = lse[:, :, :q.shape[1]] #BMH, unpad Q dimension + delta = delta.reshape([-1, delta.shape[-1], 1]) + + # bmhk -> bmk + q, k, v, out, grad_out = [bmhk2bmk(x).float() for x in (q, k, v, out, grad_out)] + + attn_T = k @ q.transpose(-2, -1) + if mask is not None: + attn_T += mask.transpose(-2, -1) + attn_T = attn_T * (1 / q.shape[-1] ** 0.5) + attn_T = attn_T - lse.reshape([-1, 1, lse.shape[-1]]) + attn_T = attn_T.exp() + + grad_v = attn_T @ grad_out + + dov = grad_out @ v.transpose(-2, -1) + tmp = (dov - delta) * attn_T.transpose(-2, -1) + tmp = tmp / (q.shape[-1] ** 0.5) + + grad_q = tmp @ k + grad_k = tmp.transpose(-2, -1) @ q + + return [x.reshape([B, H, x.shape[1], x.shape[-1]]).permute([0, 2, 1, 3]) for x in [grad_q, grad_k, grad_v]] + + +print("initializing tensors...") +query = torch.randn([B, Mq, H, K], dtype=dtype) +key = 3 * torch.randn([B, Mkv, H, K], dtype=dtype) +value = 3 * torch.randn([B, Mkv, H, Kv], dtype=dtype) +mask = create_lower_triangular_mask() if causal else None + +# let PyTorch compute gradients +query.requires_grad_(True) +key.requires_grad_(True) +value.requires_grad_(True) + +print("computing fw...") +out, lse = ref_mha_bmhk(query, key, value, mask=mask) +out = out.to(dtype).contiguous() +grad_out = 3 * torch.randn([B, Mq, H, Kv], dtype=dtype) + +print("computing bw with autograd...") +out.backward(grad_out) +scale = (1 / query.shape[-1] ** 
0.5) + + +# Additional data needed by the kernel +delta = (grad_out.float() * out.float()).sum(-1).transpose(-2, -1).contiguous() +pad_amount = (32 - (lse.shape[2] % 32)) % 32 +lse = torch.nn.functional.pad(lse, [0, pad_amount], value=math.inf) + +print("computing bw with reference implem...") +gQr, gKr, gVr = ref_mha_bw_bmhk(query, key, value, mask, lse, out, grad_out, delta) + +with PipedSubprocess(fmha_bw_binary) as bw_kernel: + # Send kernel arguments + bw_kernel.write( + TORCH_DTYPE_NAME[query.dtype], + "scale", scale, + "head_dim", K, + "head_dim_value", Kv, + "num_queries", Mq, + "num_keys", Mkv, + "num_heads", H, + "custom_mask_type", (1 if causal else 0), + "num_batches", B, + "repeat_count", repeat_count, + "num_splits_key", (Mkv // 128), + ) + bw_kernel.writeTensor(query, "query", ["q_strideB", "q_strideM", "q_strideH"]) + bw_kernel.writeTensor(key, "key", ["k_strideB", "k_strideM", "k_strideH"]) + bw_kernel.writeTensor(value, "value", ["v_strideB", "v_strideM", "v_strideH"]) + bw_kernel.writeTensor(lse, "logsumexp", ["lse_strideB", "lse_strideH"]) + bw_kernel.writeTensor(out, "output", ["o_strideB", "o_strideM", "o_strideH"]) + bw_kernel.writeTensor(grad_out, "grad_output", ["gO_strideB", "gO_strideM", "gO_strideH"]) + bw_kernel.writeTensor(delta, "delta", ["delta_strideB", "delta_strideH"]) + + if bw_kernel.read() != "OK": + print("Got unexpected output") + print(bw_kernel.subp.communicate()[0]) + sys.exit(0) + + # Read kernel output + gQ = bw_kernel.readTensor("grad_query", ["gQ_strideB", "gQ_strideM", "gQ_strideH"], query.shape).float() + gK = bw_kernel.readTensor("grad_key", ["gK_strideB", "gK_strideM", "gK_strideH"], key.shape).float() + gV = bw_kernel.readTensor("grad_value", ["gV_strideB", "gV_strideM", "gV_strideH"], value.shape).float() + runtime_ms = float(bw_kernel.readNamed("runtime_ms")) + +float_ops = B * H * sum([ + # att = Q @ K.transpose + Mq * Mkv * K * 2, + # att @ dO + Mkv * Mq * Kv * 2, + # dov = dO @ V + Mq * Kv * Mkv * 2, + # dov @ K + Mq * K * Mkv * 2, + # dov @ Q + Mq * K * Mkv * 2, +]) +if causal: + float_ops //= 2 + +print(f""" +Fused multi-head attention - backward + batch_size={B} + num_queries={Mq} + num_keys={Mkv} + num_heads={H} + head_dim={K} + head_dim_value={Kv} + + Correctness: + grad_query: {"PASS" if torch.allclose(gQ, gQr, rtol=RTOL, atol=ATOL) else "FAIL"} (delta: {(gQ - gQr).abs().max()}) + grad_key: {"PASS" if torch.allclose(gK, gKr, rtol=RTOL, atol=ATOL) else "FAIL"} (delta: {(gK - gKr).abs().max()}) + grad_value: {"PASS" if torch.allclose(gV, gVr, rtol=RTOL, atol=ATOL) else "FAIL"} (delta: {(gV - gVr).abs().max()}) + (atol={ATOL} / rtol={RTOL}) + Runtime: {runtime_ms}ms ({(float_ops / (1024 ** 4)) / (runtime_ms / 1000):.4f} TFlops) +""") + +assert torch.allclose(query.grad.float(), gQr, rtol=RTOL, atol=ATOL), "Reference implementation does not match PyTorch autograd!" +assert torch.allclose(key.grad.float(), gKr, rtol=RTOL, atol=ATOL), "Reference implementation does not match PyTorch autograd!" +assert torch.allclose(value.grad.float(), gVr, rtol=RTOL, atol=ATOL), "Reference implementation does not match PyTorch autograd!" 
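The reference gradients computed in `ref_mha_bw_bmhk` above, together with the precomputed `delta = sum(grad_out * out, dim=-1)`, follow the standard closed-form backward of softmax attention. The self-contained PyTorch sketch below checks those formulas against autograd on a tiny single-head example; the shapes and variable names are illustrative and are not tied to the executable's I/O protocol:

```python
import torch

torch.manual_seed(0)
M, N, D = 16, 24, 8                    # queries, keys, head dim (illustrative)
scale = D ** -0.5
q = torch.randn(M, D, dtype=torch.float64, requires_grad=True)
k = torch.randn(N, D, dtype=torch.float64, requires_grad=True)
v = torch.randn(N, D, dtype=torch.float64, requires_grad=True)
grad_out = torch.randn(M, D, dtype=torch.float64)

# Forward pass and autograd reference
attn = ((q @ k.T) * scale).softmax(dim=-1)
out = attn @ v
out.backward(grad_out)

# Closed-form backward, as in ref_mha_bw_bmhk (modulo blocking/recomputation)
grad_v = attn.T @ grad_out
d_attn = grad_out @ v.T
delta = (grad_out * out).sum(-1, keepdim=True)   # the same "delta" fed to the kernel
d_scores = attn * (d_attn - delta)               # softmax Jacobian collapsed per row
grad_q = d_scores @ k * scale
grad_k = d_scores.T @ q * scale

for mine, ref in [(grad_q, q.grad), (grad_k, k.grad), (grad_v, v.grad)]:
    assert torch.allclose(mine, ref, atol=1e-10)
```

The `delta` trick is what allows the kernel to avoid materializing the full attention matrix: the softmax Jacobian contribution reduces to a single scalar per query row.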
diff --git a/examples/41_fused_multi_head_attention/fmha_grouped.h b/examples/41_fused_multi_head_attention/fmha_grouped.h new file mode 100644 index 0000000000..5a2f928ad8 --- /dev/null +++ b/examples/41_fused_multi_head_attention/fmha_grouped.h @@ -0,0 +1,1023 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Grouped FMHA kernel +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/complex.h" +#include "cutlass/semaphore.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/trace.h" +#include "cutlass/gemm/kernel/gemm_transpose_operands.h" + +#include "fmha_grouped_problem_visitor.h" +#include "gemm_kernel_utils.h" +#include "gemm/mma_accum_lambda_iterator.h" +#include "epilogue/epilogue_rescale_output.h" + + +namespace { + static CUTLASS_DEVICE float atomicMaxFloat(float* addr, float value) { + // source: https://stackoverflow.com/a/51549250 + return (value >= 0) + ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) + : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value))); +} +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename MM0_, ///! Structure for computing P = Q @ K + typename MM1_, ///! Structure for computing O = P @ V + typename scalar_t_, + typename accum_t_, + typename output_t_, + typename output_accum_t_, + bool kKeepOutputInRF, ///! 
Whether the intermediate output from MM0_ should be kept in the register file + GroupScheduleMode GroupScheduleMode_ ///! Type of scheduling to perform +> +struct FMHAGrouped { +public: + using MM0 = MM0_; + using MM1 = MM1_; + + using scalar_t = scalar_t_; + using accum_t = accum_t_; + using output_t = output_t_; + using output_accum_t = output_accum_t_; + + static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_; + + static constexpr bool kNeedsOutputAccumulatorBuffer = !kKeepOutputInRF && + !cutlass::platform::is_same::value; + + // Parameters to satisfy BaseGrouped + using ElementA = scalar_t; + using ElementB = scalar_t; + using ElementC = accum_t; + using LayoutA = typename MM0::LayoutA; + using LayoutB = typename MM0::ElementB; + using LayoutC = typename MM1::ElementC; + static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformB = ComplexTransform::kNone; + static int const kAlignmentA = MM0::kAlignmentA; + static int const kAlignmentB = MM0::kAlignmentB; + static int const kAlignmentC = 1; + using Mma = typename MM1::Mma; + using EpilogueOutputOp = typename MM1::EpilogueOutputOp; + using ThreadblockSwizzle = void; + using Operator = typename MM1::Operator; + using WarpShape = typename MM1::WarpShape; + using InstructionShape = typename MM1::InstructionShape; + + using ElementQ = scalar_t; + using ElementK = scalar_t; + using ElementP = accum_t; + using ElementV = scalar_t; + using ElementO = output_t; + using ElementOAccum = output_accum_t; + using ElementAccumulator = accum_t; + + using LayoutQ = typename MM0::LayoutA; + using LayoutK = typename MM0::LayoutB; + using LayoutP = typename MM0::LayoutC; + using LayoutV = typename MM1::LayoutB; + using LayoutO = typename MM1::LayoutC; + + static bool const kPreloadV = (MM1::Mma::ArchTag::kMinComputeCapability >= 80 && + cutlass::sizeof_bits::value == 16); + + static int const kAlignmentQ = MM0::kAlignmentA; + static int const kAlignmentK = MM0::kAlignmentB; + static int const kAlignmentV = 1; + + using ThreadblockShape = typename MM0::ThreadblockShape; + + static int const kQueriesPerBlock = ThreadblockShape::kM; + static int const kKeysPerBlock = ThreadblockShape::kN; + + static constexpr bool kSupportsDropout = false; + static constexpr bool kSupportsBias = false; + + /// Warp count (concept: GemmShape) + using WarpCount = typename MM1::WarpCount; + static int const kThreadsPerWarp = 32; + static int const kThreadCount = kThreadsPerWarp * WarpCount::kCount; + + static constexpr int kNumWarpsPerBlock = + kQueriesPerBlock * kKeysPerBlock / (kThreadsPerWarp * kThreadsPerWarp); + + using ProblemVisitor = FMHAGroupedProblemVisitor< + ThreadblockShape, + kGroupScheduleMode, + kThreadCount, + kThreadCount>; + + // + // Structures + // + + /// Argument structure + struct Arguments { + + // + // Data members + // + + GemmCoord *problem_sizes0{nullptr}; + GemmCoord *problem_sizes1{nullptr}; + + int problem_count{0}; + int threadblock_count{0}; + + ElementQ ** ptr_Q{nullptr}; + ElementK ** ptr_K{nullptr}; + ElementP ** ptr_P{nullptr}; + ElementV ** ptr_V{nullptr}; + ElementO ** ptr_O{nullptr}; + ElementOAccum ** ptr_O_accum{nullptr}; + + typename LayoutQ::Stride::LongIndex *ldq{nullptr}; + typename LayoutK::Stride::LongIndex *ldk{nullptr}; + typename LayoutP::Stride::LongIndex *ldv{nullptr}; + typename LayoutO::Stride::LongIndex *ldo{nullptr}; + + // Whether causal masking is to be performed + bool causal{false}; + + // Scale + ElementAccumulator scale{0}; + + // Only used by 
device-level operator + GemmCoord *host_problem_sizes{nullptr}; + + // + // Methods + // + + /// Default ctor + Arguments() = default; + + /// Ctor + CUTLASS_HOST_DEVICE + Arguments( + GemmCoord *problem_sizes0, + GemmCoord *problem_sizes1, + int problem_count, + int threadblock_count, + ElementQ ** ptr_Q, + ElementK ** ptr_K, + ElementP ** ptr_P, + ElementV ** ptr_V, + ElementO ** ptr_O, + ElementOAccum ** ptr_O_accum, + typename LayoutQ::Stride::LongIndex *ldq, + typename LayoutK::Stride::LongIndex *ldk, + typename LayoutP::Stride::LongIndex *ldp, + typename LayoutV::Stride::LongIndex *ldv, + typename LayoutO::Stride::LongIndex *ldo, + bool causal, + ElementAccumulator scale, + GemmCoord *host_problem_sizes=nullptr + ): + problem_sizes0(problem_sizes0), + problem_sizes1(problem_sizes1), + problem_count(problem_count), + threadblock_count(threadblock_count), + ptr_Q(ptr_Q), + ptr_K(ptr_K), + ptr_P(ptr_P), + ptr_V(ptr_V), + ptr_O(ptr_O), + ptr_O_accum(kNeedsOutputAccumulatorBuffer ? ptr_O_accum : (accum_t**)ptr_O), + ldq(ldq), + ldk(ldk), + ldv(ldv), + ldo(ldo), + causal(causal), + scale(scale), + host_problem_sizes(host_problem_sizes) + { + + } + + bool __host__ check_supported() { + CHECK_ALIGNED_PTR(ptr_Q, kAlignmentQ); + CHECK_ALIGNED_PTR(ptr_K, kAlignmentK); + CHECK_ALIGNED_PTR(ptr_V, kAlignmentV); + XFORMERS_CHECK(ldq % kAlignmentQ == 0, "query is not correctly aligned"); + XFORMERS_CHECK(ldk % kAlignmentK == 0, "key is not correctly aligned"); + XFORMERS_CHECK(ldv % kAlignmentV == 0, "value is not correctly aligned"); + return true; + } + }; + + // + // Structure for precomputing values in host memory and passing to kernels + // + + /// Parameters structure + struct Params { + + typename ProblemVisitor::Params problem_visitor; + int threadblock_count; + + ElementQ ** ptr_Q; + ElementK ** ptr_K; + ElementP ** ptr_P; + ElementV ** ptr_V; + ElementO ** ptr_O; + ElementOAccum ** ptr_O_accum; + + typename LayoutQ::Stride::LongIndex *ldq; + typename LayoutK::Stride::LongIndex *ldk; + typename LayoutP::Stride::LongIndex *ldv; + typename LayoutO::Stride::LongIndex *ldo; + + ElementAccumulator scale; + bool causal; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): + ptr_Q(nullptr), + ptr_K(nullptr), + ptr_P(nullptr), + ptr_V(nullptr), + ptr_O(nullptr), + ptr_O_accum(nullptr), + ldq(nullptr), + ldk(nullptr), + ldv(nullptr), + ldo(nullptr), + causal(false), + scale(0) + { } + + CUTLASS_HOST_DEVICE + Params(Arguments const &args, + void *workspace = nullptr, + int tile_count = 0): + problem_visitor(args.problem_sizes0, args.problem_sizes1, args.problem_count, workspace, tile_count), + threadblock_count(args.threadblock_count), + ptr_Q(args.ptr_Q), + ptr_K(args.ptr_K), + ptr_P(args.ptr_P), + ptr_V(args.ptr_V), + ptr_O(args.ptr_O), + ptr_O_accum(kNeedsOutputAccumulatorBuffer ? args.ptr_O_accum : (accum_t**)args.ptr_O), + ldq(args.ldq), + ldk(args.ldk), + ldv(args.ldv), + ldo(args.ldo), + causal(args.causal), + scale(args.scale) + { + + } + + CUTLASS_HOST_DEVICE + void update( + Arguments const &args, + void *workspace = nullptr, + int tile_count = 0) { + + problem_visitor = typename ProblemVisitor::Params(args.problem_sizes0, + args.problem_sizes1, + args.problem_count, + workspace, tile_count); + threadblock_count = args.threadblock_count; + ptr_Q = args.ptr_Q; + ptr_K = args.ptr_K; + ptr_P = args.ptr_P; + ptr_V = args.ptr_V; + ptr_O = args.ptr_O; + ptr_O_accum = kNeedsOutputAccumulatorBuffer ? 
args.ptr_O_accum : (accum_t**)args.ptr_O; + ldq = args.ldq; + ldk = args.ldk; + ldv = args.ldv; + ldo = args.ldo; + causal = args.causal; + scale = args.scale; + } + }; + + // Shared storage - depends on kernel params + struct ScalingCoefs { + cutlass::Array m_prime; + cutlass::Array s_prime; + cutlass::Array mi; + cutlass::Array out_rescale; + cutlass::Array + addition_storage; + }; + + struct SharedStorageEpilogueAtEnd : ScalingCoefs { + struct SharedStorageAfterMM0 { + // Everything here might be overwritten during MM0 + typename MM0::AccumulatorSharedStorage si; + typename MM1::Mma::SharedStorage mm1; + }; + + union { + typename MM0::Mma::SharedStorage mm0; + SharedStorageAfterMM0 after_mm0; + typename MM1::DefaultEpilogue::SharedStorage epilogue; + }; + + CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& + epilogue_shared_storage() { + return epilogue; + } + + // ProblemVisitor shared storage can't be overlapped with others + typename ProblemVisitor::SharedStorage problem_visitor; + }; + + struct SharedStorageEpilogueInLoop : ScalingCoefs { + struct SharedStorageAfterMM0 { + // Everything here might be overwritten during MM0 + typename MM0::AccumulatorSharedStorage si; + typename MM1::Mma::SharedStorage mm1; + typename MM1::DefaultEpilogue::SharedStorage epilogue; + }; + + union { + typename MM0::Mma::SharedStorage mm0; + SharedStorageAfterMM0 after_mm0; + }; + + CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& + epilogue_shared_storage() { + return after_mm0.epilogue; + } + + // ProblemVisitor shared storage can't be overlapped with others + typename ProblemVisitor::SharedStorage problem_visitor; + }; + + using SharedStorage = typename cutlass::platform::conditional< + kKeepOutputInRF, + SharedStorageEpilogueAtEnd, + SharedStorageEpilogueInLoop>::type; + +private: + + // Parameters to be used by an individual tile + struct TileParams { + + CUTLASS_HOST_DEVICE + static int query_start(int threadblock_idx) { + return threadblock_idx * kQueriesPerBlock; + } + + // Returns whether this threadblock computes within the number of queries, + // which is determined by the M dimension of problem 0 + CUTLASS_HOST_DEVICE + static bool can_compute(int threadblock_idx, const GemmCoord& problem_size0) { + return query_start(threadblock_idx) < problem_size0.m(); + } + + CUTLASS_HOST_DEVICE + static int num_queries(int threadblock_idx, const GemmCoord& problem_size0) { + return problem_size0.m() - query_start(threadblock_idx); + } + + CUTLASS_HOST_DEVICE + static int num_keys(int threadblock_idx, const GemmCoord& problem_size0, bool causal) { + int nk = problem_size0.n(); + if (causal) { + nk = cutlass::fast_min(int32_t(query_start(threadblock_idx) + kQueriesPerBlock), nk); + } + return nk; + } + + }; + +public: + + // + // Methods + // + + CUTLASS_DEVICE + FMHAGrouped() { } + + /// Determines whether kernel satisfies alignment + static Status can_implement(cutlass::gemm::GemmCoord const & problem_size) { + return Status::kSuccess; + } + + static Status can_implement(Arguments const &args) { + return Status::kSuccess; + } + + static CUTLASS_DEVICE int16_t thread_id() { + return threadIdx.x; + } + + static CUTLASS_DEVICE int8_t warp_id() { + return threadIdx.x / kThreadsPerWarp; + } + + static CUTLASS_DEVICE int8_t lane_id() { + return threadIdx.x % kThreadsPerWarp; + } + + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + auto& m_prime = shared_storage.m_prime; + auto& s_prime = shared_storage.s_prime; + 
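+      // Streaming-softmax state carried across key blocks (kept in shared memory):
+      //   mi          - per-row running maximum of the scaled attention logits
+      //   m_prime     - the maximum the current partial sums were computed with
+      //   s_prime     - per-row running softmax denominator, sum of exp(logit - mi)
+      //   out_rescale - exp(m_prime - mi) (evaluated with exp2f in the log2(e)-scaled
+      //                 domain), used to rescale previously accumulated output when a
+      //                 larger row maximum is encountered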
[[maybe_unused]] auto& si = shared_storage.after_mm0.si; + auto& mi = shared_storage.mi; + auto& out_rescale = shared_storage.out_rescale; + + ProblemVisitor problem_visitor( + params.problem_visitor, + shared_storage.problem_visitor, + blockIdx.x); + + // Outer 'persistent' loop to iterate over tiles + while (problem_visitor.next_tile()) { + + GemmCoord problem_size0 = problem_visitor.problem_size0(); + GemmCoord problem_size1 = problem_visitor.problem_size1(); + const int32_t threadblock_idx = int32_t(problem_visitor.threadblock_idx()); + + if (!TileParams::can_compute(threadblock_idx, problem_size0)) { + problem_visitor.advance(gridDim.x); + continue; + } + + const int32_t problem_idx = problem_visitor.problem_index(); + + if (thread_id() < kQueriesPerBlock) { + s_prime[thread_id()] = ElementAccumulator(0); + out_rescale[thread_id()] = accum_t(1.0); + m_prime[thread_id()] = + -cutlass::platform::numeric_limits::infinity(); + mi[thread_id()] = -cutlass::platform::numeric_limits::infinity(); + } + + ElementO *ptr_O = params.ptr_O[problem_idx] + TileParams::query_start(threadblock_idx) * params.ldo[problem_idx]; + ElementOAccum *ptr_O_accum = params.ptr_O_accum[problem_idx] + TileParams::query_start(threadblock_idx) * params.ldo[problem_idx]; + const int num_queries = TileParams::num_queries(threadblock_idx, problem_size0); + + auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator { + using OutputTileIterator = typename MM1::OutputTileIterator; + return OutputTileIterator( + typename OutputTileIterator::Params{(int32_t)params.ldo[problem_idx]}, + ptr_O, + typename OutputTileIterator::TensorCoord{ + num_queries, problem_size1.n()}, + thread_id(), + {0, col}); + }; + + auto createOutputAccumIter = [&](int col) -> + typename MM1::OutputTileIteratorAccum { + using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum; + return OutputTileIteratorAccum( + typename OutputTileIteratorAccum::Params{(int32_t)params.ldo[problem_idx]}, + ptr_O_accum, + typename OutputTileIteratorAccum::TensorCoord{ + num_queries, problem_size1.n()}, + thread_id(), + {0, col}); + }; + + typename MM1::Mma::FragmentC accum_o; + accum_o.clear(); + + const int num_keys = TileParams::num_keys(threadblock_idx, problem_size0, params.causal); + + for (int32_t iter_key_start = 0; iter_key_start < num_keys; + iter_key_start += kKeysPerBlock) { + int32_t problem_size_0_m = + cutlass::fast_min((int32_t)kQueriesPerBlock, num_queries); + int32_t problem_size_0_n = cutlass::fast_min( + (int32_t)kKeysPerBlock, num_keys - iter_key_start); + int32_t const& problem_size_0_k = problem_size0.k(); + int32_t const& problem_size_1_n = problem_size1.n(); + int32_t const& problem_size_1_k = problem_size_0_n; + + auto prologueV = [&](int blockN) { + typename MM1::Mma::IteratorB iterator_V( + typename MM1::IteratorB::Params{typename MM1::LayoutB(params.ldv[problem_idx])}, + params.ptr_V[problem_idx] + iter_key_start * params.ldv[problem_idx], + {problem_size_1_k, problem_size_1_n}, + thread_id(), + cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); + + MM1::Mma::prologue( + shared_storage.after_mm0.mm1, + iterator_V, + thread_id(), + problem_size_1_k); + }; + + __syncthreads(); // Need to have shared memory initialized, and `m_prime` + // updated from end of prev iter + + // + // MATMUL: Q.K_t + // + // Computes the block-matrix product of: + // (a) query[query_start:query_end, :] + // with + // (b) key[iter_key_start:iter_key_start + kKeysPerBlock] + // and stores that into `shared_storage.si` + // + + ElementQ 
*ptr_Q = params.ptr_Q[problem_idx] + TileParams::query_start(threadblock_idx) * params.ldq[problem_idx]; + + // Construct iterators to A and B operands + typename MM0::IteratorA iterator_A( + typename MM0::IteratorA::Params( + typename MM0::MmaCore::LayoutA(params.ldq[problem_idx])), + ptr_Q, + {problem_size_0_m, problem_size_0_k}, + thread_id(), + {0, 0}); + + typename MM0::IteratorB iterator_B( + typename MM0::IteratorB::Params( + typename MM0::MmaCore::LayoutB(params.ldk[problem_idx])), + params.ptr_K[problem_idx] + iter_key_start * params.ldk[problem_idx], + {problem_size_0_k, problem_size_0_n}, + thread_id(), + {0, 0}); + + // Construct thread-scoped matrix multiply + typename MM0::Mma mma( + shared_storage.mm0, thread_id(), warp_id(), lane_id()); + + typename MM0::Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = + (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + __syncthreads(); + + if (kPreloadV) { + prologueV(0); + } else { + MM1::Mma::drain_cp_asyncs(); + } + + typename MM0::Mma::Operator::IteratorC::TensorCoord + iteratorC_tile_offset = { + (warp_id() % MM0::Mma::WarpCount::kM), + (warp_id() / MM0::Mma::WarpCount::kM) + }; + + // Mask out last if causal + if (params.causal && num_keys - iter_key_start <= kKeysPerBlock) { + auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset( + lane_id(), warp_id(), iteratorC_tile_offset); + int32_t last_col; + MM0::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + last_col = TileParams::query_start(threadblock_idx) + accum_m - iter_key_start; + }, + [&](int accum_m, int accum_n, int idx) { + if (accum_n > last_col) { + accum[idx] = + -cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + } + // DISPATCH_BOOL(iter_key_start == 0, kIsFirst, ([&] { + // DISPATCH_BOOL( + // num_keys - iter_key_start >= kKeysPerBlock, + // kFullColumns, + // ([&] { + // // Update `mi` from accum stored in registers + // // Also does accum[i] <- exp(accum[i] - mi) + // iterative_softmax< + // typename MM0::Mma::Operator::IteratorC, + // kFullColumns, + // kIsFirst>( + // accum_o, + // accum, + // mi, + // m_prime, + // s_prime, + // lane_id(), + // thread_id(), + // warp_id(), + // num_keys - iter_key_start, + // iteratorC_tile_offset, + // kSupportsBias ? 1.0f : params.scale); + // })); + // })); + + // Update `mi` from accum stored in registers + // Also does accum[i] <- exp(accum[i] - mi) + iterative_softmax( + accum_o, + accum, + mi, + m_prime, + s_prime, + out_rescale, + shared_storage.addition_storage, + lane_id(), + thread_id(), + warp_id(), + num_keys - iter_key_start, + iter_key_start == 0, + iteratorC_tile_offset, + kSupportsBias ? 1.0f : params.scale); + + // Output results to shared-memory + int warp_idx_mn_0 = warp_id() % + (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM, + warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM}; + + MM0::B2bGemm::accumToSmem( + shared_storage.after_mm0.si, accum, lane_id(), output_tile_coords); + + __syncthreads(); + + // + // MATMUL: Attn . V + // Run the matmul `attn @ V` for a block of attn and V. + // `attn` is read from shared memory (in `shared_storage_si`) + // `V` is read from global memory (with iterator_B) + // + + const int64_t nBlockN = kKeepOutputInRF ? 
1 + : ceil_div( + (int64_t)problem_size_1_n, + int64_t(MM1::ThreadblockShape::kN)); + + // Iterate over the N dimension of GEMM1 + for (int blockN = 0; blockN < nBlockN; ++blockN) { + int gemm_k_iterations = + (problem_size_1_k + MM1::Mma::Shape::kK - 1) / MM1::Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add and store it in accum + // (in registers) + if (!kPreloadV) { + __syncthreads(); // we share shmem between mma and epilogue + } + + typename MM1::Mma::IteratorB iterator_V( + typename MM1::IteratorB::Params{typename MM1::LayoutB(params.ldv[problem_idx])}, + params.ptr_V[problem_idx] + iter_key_start * params.ldv[problem_idx], + {problem_size_1_k, problem_size_1_n}, + thread_id(), + cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); + + typename MM1::Mma mma_pv( + // operand A: Pij_dropped in shared memory + shared_storage.after_mm0.si.accum_ref(), + // operand B: shared memory staging area for Vj, which is loaded + // from global memory + shared_storage.after_mm0.mm1.operand_B_ref(), + (int)thread_id(), + (int)warp_id(), + (int)lane_id()); + + mma_pv.set_prologue_done(kPreloadV); + if (!kKeepOutputInRF) { + accum_o.clear(); + } + + mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o); + __syncthreads(); + + if (kPreloadV && !kKeepOutputInRF && blockN + 1 < nBlockN) { + prologueV(blockN + 1); + } + + if (!kKeepOutputInRF) { + MM1::Mma::drain_cp_asyncs(); + DISPATCH_BOOL( + iter_key_start == 0, kIsFirst, ([&] { + DISPATCH_BOOL( + (iter_key_start + kKeysPerBlock) >= num_keys, + kIsLast, + ([&] { + using DefaultEpilogue = typename MM1::DefaultEpilogue; + using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp; + using ElementCompute = typename DefaultOp::ElementCompute; + using EpilogueOutputOp = typename cutlass::epilogue:: + thread::MemoryEfficientAttentionNormalize< + typename cutlass::platform::conditional< + kIsLast::value, + output_t, + output_accum_t>::type, + output_accum_t, + DefaultOp::kCount, + typename DefaultOp::ElementAccumulator, + output_accum_t, + kIsFirst::value, + kIsLast::value, + cutlass::Array>; + using Epilogue = typename cutlass::epilogue::threadblock:: + EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename MM1::Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename cutlass::platform::conditional< + kIsLast::value, + typename MM1::OutputTileIterator, + typename MM1::OutputTileIteratorAccum>::type, + typename DefaultEpilogue:: + AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true, // IterationsUnroll + typename MM1::OutputTileIteratorAccum // Read + // iterator + >; + + int col = blockN * MM1::Mma::Shape::kN; + auto source_iter = createOutputAccumIter(col); + auto dest_iter = gemm_kernel_utils::call_conditional< + kIsLast::value, + decltype(createOutputIter), + decltype(createOutputAccumIter)>:: + apply(createOutputIter, createOutputAccumIter, col); + EpilogueOutputOp rescale(s_prime, out_rescale); + Epilogue epilogue( + shared_storage.epilogue_shared_storage(), + thread_id(), + warp_id(), + lane_id()); + epilogue(rescale, dest_iter, accum_o, source_iter); + })); + })); + if (!kKeepOutputInRF) { + __syncthreads(); + } + } + } + __syncthreads(); // we modify `m_prime` after + } + + if (kKeepOutputInRF) { + constexpr bool kIsFirst = true; + constexpr bool kIsLast = true; + using DefaultEpilogue = typename MM1::DefaultEpilogue; + using 
DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp; + using ElementCompute = typename DefaultOp::ElementCompute; + using EpilogueOutputOp = + typename cutlass::epilogue::thread::MemoryEfficientAttentionNormalize< + output_t, // output + output_accum_t, // source + DefaultOp::kCount, + typename DefaultOp::ElementAccumulator, // accum + output_accum_t, // compute + kIsFirst, + kIsLast, + cutlass::Array>; + using Epilogue = + typename cutlass::epilogue::threadblock::EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename MM1::Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename MM1::OutputTileIterator, // destination + typename DefaultEpilogue::AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true, // IterationsUnroll + typename MM1::OutputTileIteratorAccum // source tile + >; + auto dest_iter = createOutputIter(0); + EpilogueOutputOp rescale(s_prime, out_rescale); + Epilogue epilogue( + shared_storage.epilogue_shared_storage(), + thread_id(), + warp_id(), + lane_id()); + MM1::Mma::drain_cp_asyncs(); + epilogue(rescale, dest_iter, accum_o); + } + + // Next tile + problem_visitor.advance(gridDim.x); + __syncthreads(); // Don't start the next iteration until all threads are done using shared memory. + } + } + + template + CUTLASS_DEVICE static void iterative_softmax( + typename WarpIteratorC::Fragment& frag_o, // output so far + typename WarpIteratorC::Fragment& frag, + cutlass::Array& mi, + cutlass::Array& m_prime, + cutlass::Array& s_prime, + cutlass::Array& out_rescale, + cutlass::Array& + addition_storage, + int8_t lane_id, + int8_t thread_id, + int8_t warp_id, + int max_col, + bool is_first, + typename WarpIteratorC::TensorCoord const& tile_offset, + float scaling) { + /* Iterates on the accumulator and corresponding position on result matrix + + (1) Update `mi[r]` to the max value of the row `r` + (2) In a second iteration do the following: + (a) accum <- exp(accum - mi) + (b) m_prime <- exp(m_prime - mi) + (c) s_prime <- s_prime * m_prime + sum(accum) + + All of this is done on registers, before we store all of this + on shared memory for the next matmul with Value. + */ + using Fragment = typename WarpIteratorC::Fragment; + using LambdaIterator = typename DefaultMmaAccumLambdaIterator< + WarpIteratorC, + accum_t, + kThreadsPerWarp>::Iterator; + // Convert to `accum_t` (rather than double) + constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E + + static_assert(kQueriesPerBlock % kNumWarpsPerBlock == 0, ""); + static constexpr int kLinesPerWarp = kQueriesPerBlock / kNumWarpsPerBlock; + + frag = cutlass::multiplies()(scaling * kLog2e, frag); + + auto lane_offset = + LambdaIterator::get_lane_offset(lane_id, warp_id, tile_offset); + + // First update `mi` to the max per-row + { + accum_t max; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + max = -cutlass::platform::numeric_limits::infinity(); + }, + [&](int accum_m, int accum_n, int idx) { + if (accum_n < max_col) { + max = cutlass::fast_max(max, frag[idx]); + } + }, + [&](int accum_m) { + // Having 4x atomicMax seems faster than reduce within warp + // first... + atomicMaxFloat(&mi[accum_m], max); + }); + } + + // Make sure we all share the update values for `mi` + __syncthreads(); + + // Doing this `exp` is quite expensive. 
Let's + // split it across the warps + bool restore_mi_to_minus_inf = false; + if (lane_id < kLinesPerWarp) { + int id = warp_id * kLinesPerWarp + lane_id; + auto m_prime_id = m_prime[id]; + auto mi_id = mi[id]; + bool changed = m_prime_id < mi_id; // `false` if both are -inf + if (changed) { + auto m_prime_exp = exp2f(m_prime_id - mi_id); + out_rescale[id] = m_prime_exp; + s_prime[id] *= m_prime_exp; + } else { + // Only when bias is enabled, it's possible that all the first values + // of attention are masked to `-inf`. In that case we want to avoid + // `nan = exp2f(-inf - (-inf))` so we temporarily set `mi` to 0 + if (kSupportsBias && + mi_id == -cutlass::platform::numeric_limits::infinity()) { + restore_mi_to_minus_inf = true; + mi[id] = 0.0f; + } + out_rescale[id] = 1.0f; + } + } + __syncthreads(); // Update output fragments + if (kKeepOutputInRF && !is_first) { + accum_t line_rescale; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { line_rescale = out_rescale[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + frag_o[idx] = frag_o[idx] * line_rescale; + }, + [&](int accum_m) {}); + } + // Update accum_m, accum_n, ... + { + accum_t mi_row, total_row; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { mi_row = mi[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + frag[idx] = + (accum_n < max_col) ? exp2f(frag[idx] - mi_row) : accum_t(0.0); + }, + [&](int accum_m) {}); + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { total_row = 0.0; }, + [&](int accum_m, int accum_n, int idx) { total_row += frag[idx]; }, + [&](int accum_m) { + if (LambdaIterator::reduceSameRow( + lane_id, total_row, [](accum_t a, accum_t b) { + return a + b; + })) { + // NOTE: we could atomically add `total_row` to `s_prime`, but + // it's faster (and deterministic) to avoid atomics here + addition_storage + [accum_m + kQueriesPerBlock * tile_offset.column()] = + total_row; + } + }); + } + + __syncthreads(); + if (lane_id < kLinesPerWarp) { + int id = warp_id * kLinesPerWarp + lane_id; + accum_t total_row = s_prime[id]; + if (restore_mi_to_minus_inf) { + // Restore `mi`, see above when we set `restore_mi_to_minus_inf=true` + mi[id] = -cutlass::platform::numeric_limits::infinity(); + } else { + m_prime[id] = mi[id]; + } + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) { + total_row += addition_storage[id + kQueriesPerBlock * i]; + } + s_prime[id] = total_row; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/fmha_grouped_problem_visitor.h b/examples/41_fused_multi_head_attention/fmha_grouped_problem_visitor.h new file mode 100644 index 0000000000..38695d5a81 --- /dev/null +++ b/examples/41_fused_multi_head_attention/fmha_grouped_problem_visitor.h @@ -0,0 +1,178 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Scheduler for grouped FMHA +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/gemm/kernel/grouped_problem_visitor.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { +// Helper for correctly representing problem sizes in grouped kernels +template +struct FMHAGroupedProblemSizeHelper { + + CUTLASS_HOST_DEVICE + static cutlass::gemm::GemmCoord grid_shape(const cutlass::gemm::GemmCoord& problem) { + // FMHA only partitions tiles across the M dimension. 
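+    // i.e. a 1-D grid of ceil_div(problem.m(), ThreadblockShape::kM) tiles along M;
+    // the N and K grid dimensions are always 1.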
+ return cutlass::gemm::GemmCoord( + ((problem.m() - 1 + ThreadblockShape::kM) / ThreadblockShape::kM), 1, 1); + } + + CUTLASS_HOST_DEVICE + static void possibly_transpose_problem(cutlass::gemm::GemmCoord& problem) {} + + CUTLASS_HOST_DEVICE + static int32_t tile_count(const cutlass::gemm::GemmCoord& grid) { + return grid.m() * grid.n(); + } +}; + +} // namespace detail + +/// Visitor class to abstract away the algorithm for iterating over tiles +template +struct FMHAGroupedProblemVisitor : public GroupedProblemVisitor< + detail::FMHAGroupedProblemSizeHelper, + ThreadblockShape, + GroupScheduleMode_, + PrefetchTileCount, + ThreadCount> { + + using ProblemSizeHelper = detail::FMHAGroupedProblemSizeHelper; + using Base = GroupedProblemVisitor; + using BaseParams = typename Base::Params; + using SharedStorage = typename Base::SharedStorage; + + cutlass::gemm::GemmCoord const *problem_sizes0; + cutlass::gemm::GemmCoord const *problem_sizes1; + + struct Params { + cutlass::gemm::GemmCoord const *problem_sizes0; + cutlass::gemm::GemmCoord const *problem_sizes1; + int32_t problem_count; + void const *workspace; + int32_t tile_count; + + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + Params(): problem_sizes0(nullptr), problem_sizes1(nullptr), + problem_count(0), workspace(nullptr), tile_count(0) { } + + /// Ctor + CUTLASS_HOST_DEVICE + Params( + cutlass::gemm::GemmCoord const *problem_sizes0, + cutlass::gemm::GemmCoord const *problem_sizes1, + int32_t problem_count, + void const *workspace = nullptr, + int32_t tile_count = 0 + ): + problem_sizes0(problem_sizes0), + problem_sizes1(problem_sizes1), + problem_count(problem_count), + workspace(workspace), + tile_count(tile_count) + {} + + /// Convert the FMHA-specific parameters to those used by the base class + CUTLASS_HOST_DEVICE + BaseParams to_base() const { + return BaseParams(// Set problem_sizes as problem_sizes1 because these determine + // shape of the final output of FMHA + problem_sizes1, + problem_count, + workspace, + tile_count); + } + + }; + + // + // Methods + // + CUTLASS_DEVICE + FMHAGroupedProblemVisitor( + Params const ¶ms_, + SharedStorage &shared_storage_, + int32_t block_idx + ): Base ( + params_.to_base(), + shared_storage_, block_idx), + problem_sizes0(params_.problem_sizes0), + problem_sizes1(params_.problem_sizes1) + {} + + /// Returns the problem size 0 for the current problem + CUTLASS_HOST_DEVICE + cutlass::gemm::GemmCoord problem_size0() const { + GemmCoord problem = problem_sizes0[this->problem_idx]; + ProblemSizeHelper::possibly_transpose_problem(problem); + return problem; + } + + /// Returns the problem size 1 for the current problem + CUTLASS_HOST_DEVICE + cutlass::gemm::GemmCoord problem_size1() const { + GemmCoord problem = problem_sizes1[this->problem_idx]; + ProblemSizeHelper::possibly_transpose_problem(problem); + return problem; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu b/examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu new file mode 100644 index 0000000000..544e400fc9 --- /dev/null +++ b/examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu @@ -0,0 +1,298 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include + +#include "kernel_backward.h" + +#include "cutlass/util/device_memory.h" +#include "cutlass/util/host_tensor.h" + + +using Arch = cutlass::arch::Sm80; +static constexpr int kMaxK = 128; + +template +struct DefaultKernel { + // Some heuristics to select the best kernel (tested on Sm60, Sm70, Sm80) + // NOTE: Requires quite a lot of shmem for Sm80+, + // so might require tweaking those manually for Sm86/Sm89 + + static constexpr bool kSupports64x128 = + ArchTag::kMinComputeCapability >= 80 || + (ArchTag::kMinComputeCapability >= 70 && + cutlass::sizeof_bits::value <= 16); + static constexpr int kBlockSizeI = kSupports64x128 && kMaxK > 64 ? 128 : 64; + static constexpr bool kIsHalf = cutlass::sizeof_bits::value <= 16; + static constexpr bool kOutputInRF = kIsHalf && kMaxK <= kBlockSizeI; + static constexpr bool kPreload = kIsHalf && ArchTag::kMinComputeCapability >= 80 && kOutputInRF; + static constexpr int kBlockSizeJ = kPreload && kMaxK > 64 ? 
128 : 64; + + using Kernel = AttentionBackwardKernel< + Arch, + Element, + true, // kIsAligned_ + false, // kApplyDropout_ + kPreload, // kPreload_ + kBlockSizeI, // kBlockSizeI_, + kBlockSizeJ, // kBlockSizeJ_, + kMaxK, // kMaxK + false, // kKeysQueriesAlignedToBlockSize + true // kEnableSplitKeys + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace { +template struct TypeName; +template <> struct TypeName { static constexpr const char* Name = "f32"; }; +template <> struct TypeName { static constexpr const char* Name = "f16"; }; +template <> struct TypeName { static constexpr const char* Name = "b16"; }; + +void readExpect(std::string const& expected) { + std::string read; + std::cin >> read; + if (read != expected) { + std::cerr << "FATAL: Read '" << read << "' but expected '" << expected << "'" << std::endl; + std::exit(1); + } +} + +/// Helpers to read from stdin +template +cutlass::HostTensor readTensorOnDevice(std::string const& expectedName) { + readExpect("tensor_begin"); + readExpect(std::string(TypeName::Name) + ":" + expectedName); + uint64_t len = 0; + std::cin >> len; + readExpect("file"); + std::string filename; + std::cin >> filename; + + cutlass::HostTensor tensor({int64_t(1), int64_t(len / sizeof(Element))}); + uint8_t* data = (uint8_t*)tensor.host_data(); + + std::fstream myFile(filename, std::ios::in | std::ios::binary ); + myFile.read((char*)data, len); + readExpect("tensor_end"); + tensor.sync_device(); + return tensor; +} + +int64_t readInt64(std::string const& expectedName) { + readExpect(expectedName); + int64_t s = 0; + std::cin >> s; + return s; +} + +float readFloat(std::string const& expectedName) { + readExpect(expectedName); + float s = 0; + std::cin >> s; + return s; +} + +// Writing +template +void writeTensor(std::string const& name, cutlass::HostTensor& tensor) { + tensor.sync_host(); // device->host + size_t u8len = tensor.size() * sizeof(Element); + + // Python is expected to provide a file name to write to + readExpect("tmpfile"); + std::string tmpfile; + std::cin >> tmpfile; + + uint8_t* data = (uint8_t*)tensor.host_data(); + std::fstream myFile(tmpfile, std::ios::out | std::ios::binary ); + myFile.write((char*)data, u8len); + myFile.close(); + + std::cout << "tensor_begin " << TypeName::Name << ":" << name << " "; + std::cout << u8len << " file " << tmpfile << " tensor_end" << std::endl; +} + +void writeInt64(std::string const& name, int64_t value) { + std::cout << name << " " << value << std::endl; +} +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +int runKernel() { + using Kernel = typename DefaultKernel::Kernel; + +#define READ_I64(NAME) p.NAME = (decltype(p.NAME))readInt64(#NAME) +#define READ_TENSOR_AND_STRIDES_BMH(DT, NAME, NAME_XS) \ + auto storage##NAME = readTensorOnDevice
(#NAME); \ + p.NAME##_ptr = storage##NAME.device_data(); \ + READ_I64(NAME_XS##_strideB); \ + READ_I64(NAME_XS##_strideM); \ + READ_I64(NAME_XS##_strideH); + +#define CUDA_CHECK(FN) { \ + auto cudaError = FN; \ + if (cudaError != cudaSuccess) { \ + std::cerr << "FATAL: " #FN " failed: " << cudaGetErrorString(cudaError) << std::endl; \ + return -1; \ + } \ +} + + typename Kernel::Params p; + p.scale = readFloat("scale"); + READ_I64(head_dim); + READ_I64(head_dim_value); + READ_I64(num_queries); + READ_I64(num_keys); + READ_I64(num_heads); + READ_I64(custom_mask_type); + READ_I64(num_batches); + int64_t repeat_count = readInt64("repeat_count"); + READ_I64(num_splits_key); + + READ_TENSOR_AND_STRIDES_BMH(Element, query, q); + READ_TENSOR_AND_STRIDES_BMH(Element, key, k); + READ_TENSOR_AND_STRIDES_BMH(Element, value, v); + auto lse = readTensorOnDevice("logsumexp"); + p.logsumexp_ptr = lse.device_data(); + p.lse_strideB = readInt64("lse_strideB"); + p.lse_strideH = readInt64("lse_strideH"); + + // output + auto stOutput = readTensorOnDevice("output"); + p.output_ptr = stOutput.device_data(); + READ_I64(o_strideB); + auto o_strideM = readInt64("o_strideM"); + if (o_strideM != p.o_strideM()) { + std::cerr << "Invalid `o_strideM`: " << o_strideM << " - expected " << p.o_strideM(); + return 2; + } + READ_I64(o_strideH); + + READ_TENSOR_AND_STRIDES_BMH(Element, grad_output, gO); + + auto stDelta = readTensorOnDevice("delta"); + p.delta_ptr = stDelta.device_data(); + READ_I64(delta_strideB); + READ_I64(delta_strideH); + + // Allocate workspace + if (p.workspace_size()) { + cudaMalloc(&p.workspace, p.workspace_size()); + } + + // Allocate outputs in BMHK format + p.gQKV_strideM_multiplier = 1; + p.gQ_strideH = p.head_dim; + p.gQ_strideB = p.gQ_strideM() * p.num_queries; + p.gK_strideH = p.head_dim; + p.gK_strideB = p.gK_strideM() * p.num_keys; + p.gV_strideH = p.head_dim_value; + p.gV_strideB = p.gV_strideM() * p.num_keys; + + cutlass::HostTensor gQ({int64_t(1), p.gQ_strideB * p.num_batches}); + cutlass::HostTensor gK({int64_t(1), p.gK_strideB * p.num_batches}); + cutlass::HostTensor gV({int64_t(1), p.gV_strideB * p.num_batches}); + p.grad_query_ptr = gQ.device_data(); + p.grad_key_ptr = gK.device_data(); + p.grad_value_ptr = gV.device_data(); + + if (!Kernel::check_supported(p)) { + std::cerr << "FATAL: Kernel does not support these inputs" << std::endl; + return 2; + } + + // Run kernel + cudaDeviceSynchronize(); + auto kernel_fn = attention_kernel_backward_batched_impl; + size_t smem_bytes = sizeof(typename Kernel::SharedStorage); + CUDA_CHECK(cudaFuncSetAttribute(kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, int(smem_bytes))); + kernel_fn<<>>(p); + + // Write outputs + std::cout << "OK "; + writeTensor("grad_query", gQ); + writeInt64("gQ_strideB", p.gQ_strideB); + writeInt64("gQ_strideM", p.gQ_strideM()); + writeInt64("gQ_strideH", p.gQ_strideH); + writeTensor("grad_key", gK); + writeInt64("gK_strideB", p.gK_strideB); + writeInt64("gK_strideM", p.gK_strideM()); + writeInt64("gK_strideH", p.gK_strideH); + writeTensor("grad_value", gV); + writeInt64("gV_strideB", p.gV_strideB); + writeInt64("gV_strideM", p.gV_strideM()); + writeInt64("gV_strideH", p.gV_strideH); + + // Timing + cudaEvent_t events[2]; + for (auto & event : events) { + CUDA_CHECK(cudaEventCreate(&event)); + } + CUDA_CHECK(cudaEventRecord(events[0])); + for (int i = 0; i < repeat_count; ++i) { + kernel_fn<<>>(p); + } + CUDA_CHECK(cudaEventRecord(events[1])); + CUDA_CHECK(cudaEventSynchronize(events[1])); + // Measure elapsed 
runtime + float runtime_ms = 0; + CUDA_CHECK(cudaEventElapsedTime(&runtime_ms, events[0], events[1])); + + std::cout << "runtime_ms " << runtime_ms / float(repeat_count) << std::endl; + return 0; +} + +int main() { + std::ios_base::sync_with_stdio(false); + + std::string dtype; + std::cin >> dtype; + std::cerr << "Running kernel with dtype: " << dtype << std::endl; + if (dtype == "f16") { + return runKernel(); + } else if (dtype == "b16") { + return runKernel(); + } else if (dtype == "f32") { + return runKernel(); + } else { + std::cerr << "FATAL: Unknown dtype: " << dtype << std::endl; + return 3; + } +} +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu b/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu new file mode 100644 index 0000000000..cf02a7b933 --- /dev/null +++ b/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu @@ -0,0 +1,1110 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief CUTLASS Attention Example. + + This workload computes a fused multi head attention. + Because it keeps the attention matrix in shared memory, it's both faster and + uses less global memory. + + This is based on `"Self-Attention Does Not Need O(n^2) Memory" `_, + and very similar to `"FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness" `_. 
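+
+    The algorithm is summarized in the section below. For readers who prefer complete code, the
+    following is a minimal host-side C++ sketch of the same streaming-softmax formulation,
+    including the running-maximum rescaling explained below (a sketch only, assuming a single
+    head, row-major float buffers, and an illustrative helper name; the fused kernel performs
+    the equivalent computation block-wise in shared memory and registers):
+
+    ```
+    #include <algorithm>
+    #include <cmath>
+    #include <vector>
+
+    // Reference streaming attention for one head: O = softmax(scale * Q K^T) V.
+    void attention_reference_streaming(
+        std::vector<float> const& Q,   // [num_queries x head_dim]
+        std::vector<float> const& K,   // [num_keys    x head_dim]
+        std::vector<float> const& V,   // [num_keys    x head_dim_v]
+        std::vector<float>&       O,   // [num_queries x head_dim_v]
+        int num_queries, int num_keys, int head_dim, int head_dim_v,
+        float scale, int key_block_size = 64) {
+
+      O.assign(size_t(num_queries) * head_dim_v, 0.f);
+      for (int q = 0; q < num_queries; ++q) {
+        float m_prime = -INFINITY;   // running maximum of the scaled logits
+        float s_prime = 0.f;         // running sum of exp(logit - running maximum)
+        for (int kv0 = 0; kv0 < num_keys; kv0 += key_block_size) {
+          int kv1 = std::min(kv0 + key_block_size, num_keys);
+          // New maximum over this block of keys, merged with the running maximum.
+          float mi = m_prime;
+          for (int kv = kv0; kv < kv1; ++kv) {
+            float logit = 0.f;
+            for (int d = 0; d < head_dim; ++d) logit += Q[q * head_dim + d] * K[kv * head_dim + d];
+            mi = std::max(mi, logit * scale);
+          }
+          // Rescale previously accumulated results when the maximum grows.
+          float correction = std::exp(m_prime - mi);
+          s_prime *= correction;
+          for (int d = 0; d < head_dim_v; ++d) O[q * head_dim_v + d] *= correction;
+          m_prime = mi;
+          // Accumulate this block's contribution.
+          for (int kv = kv0; kv < kv1; ++kv) {
+            float logit = 0.f;
+            for (int d = 0; d < head_dim; ++d) logit += Q[q * head_dim + d] * K[kv * head_dim + d];
+            float si = std::exp(logit * scale - mi);
+            s_prime += si;
+            for (int d = 0; d < head_dim_v; ++d) O[q * head_dim_v + d] += si * V[kv * head_dim_v + d];
+          }
+        }
+        // Final normalization by the softmax denominator.
+        for (int d = 0; d < head_dim_v; ++d) O[q * head_dim_v + d] /= s_prime;
+      }
+    }
+    ```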
+ + Algorithm: + In short, we can compute the output incrementally in blocks of size B, + we just need to divide the final result by the sum of all coefficients in + the softmax (which we compute incrementally) with the following pseudo-code: + + ``` + s_prime = torch.zeros([num_queries, B]) + O = torch.zeros([num_queries, head_size_v]) + for i in range(0, K.shape[0], B): + si = exp((Q . K[i * B:(i+1) * B].t) * scale) + sum_coefs += attn_unscaled.sum(-1) + O += si . V[i * B:(i+1) * B] + O = O / s_prime + ``` + + In practice, and for numerical stability reasons, + we also substract the maximum so far (`mi`) before doing + the exponential. When we encounter new keys, the maximum + used to compute O so far (`m_prime`) can differ from the + current maximum, so we update O before accumulating with + + ``` + O = O * exp(m_prime - mi) + m_prime = mi + ``` + + Implementation details: + - `si` is stored in shared memory between the 2 back to back gemms + - we keep and accumulate the output + directly in registers if we can (`head_size_v <= 128`). + Otherwise, we store it & accumulate in global memory (slower) + - blocks are parallelized across the batch dimension, the number + of heads, and the query sequence size + + + Examples: + + # Run an attention example with default setup + $ ./examples/41_fused_multi_head_attention/41_fused_multi_head_attention_fixed_seqlen + + # Run an attention example with custom setup + $ ./examples/41_fused_multi_head_attention/41_fused_multi_head_attention_fixed_seqlen --head_number=2 --batch_size=3 --head_size=32 --head_size_v=64 --seq_length=512 --seq_length_kv=1024 --causal=true + + Acknowledgement: Fixed-sequence-length FMHA code was upstreamed by Meta xFormers (https://github.com/facebookresearch/xformers). +*/ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/kernel/gemm_grouped.h" +#include "cutlass/gemm/kernel/default_gemm_grouped.h" +#include "cutlass/gemm/device/gemm_grouped.h" +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm_complex.h" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_norm.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/gemm/kernel/gemm_grouped.h" +#include "cutlass/gemm/kernel/gemm_transpose_operands.h" +#include "cutlass/gemm/kernel/default_gemm.h" +#include "cutlass/gemm/kernel/default_gemm_complex.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/epilogue/threadblock/epilogue_with_visitor.h" +#include "cutlass/fast_math.h" +#include "kernel_forward.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = 
cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + bool reference_check; + bool use_mask; + bool causal; + + std::vector problem_sizes0; + std::vector problem_sizes1; + + std::vector problem_sizes0_real; + std::vector problem_sizes1_real; + + int alignment; + int head_number; + int batch_size; + int head_size; + int head_size_v; + int seq_length; + int seq_length_kv; + int iterations; + + // alpha0, alpha1 and beta are fixed + // in this multi-head attention example + float alpha0; + float alpha1; + float beta; + + // + // Methods + // + + Options(): + help(false), + error(false), + alignment(1), + reference_check(true), + head_number(12), + batch_size(16), + head_size(64), + head_size_v(64), + seq_length(1024), + seq_length_kv(1024), + use_mask(false), + iterations(20), + causal(false) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("alignment", alignment, 1); + cmd.get_cmd_line_argument("head_number", head_number, 12); + cmd.get_cmd_line_argument("batch_size", batch_size, 16); + cmd.get_cmd_line_argument("head_size", head_size, 64); + cmd.get_cmd_line_argument("head_size_v", head_size_v, head_size); + cmd.get_cmd_line_argument("seq_length", seq_length, 1024); + cmd.get_cmd_line_argument("seq_length_kv", seq_length_kv, seq_length); + cmd.get_cmd_line_argument("use_mask", use_mask, false); + cmd.get_cmd_line_argument("iterations", iterations, 20); + cmd.get_cmd_line_argument("reference-check", reference_check, true); + cmd.get_cmd_line_argument("causal", causal, true); + + randomize_problems(); + + } + + void randomize_problems() { + + int problem_count = head_number * batch_size; + + problem_sizes0.reserve(problem_count); + problem_sizes1.reserve(problem_count); + + // When using mask, the original inputs are not padded + // and we need to save these info. + if (use_mask) { + problem_sizes0_real.reserve(problem_count); + problem_sizes1_real.reserve(problem_count); + } + + for (int i = 0; i < batch_size; ++i) { + // problems belonging to the same batch share the same seq len + int m_real = seq_length; + int mkv_real = seq_length_kv; + int m = (m_real + alignment - 1) / alignment * alignment; + int mkv = (mkv_real + alignment - 1) / alignment * alignment; + int k0 = head_size; + int k1 = head_size_v; + + for (int j = 0; j < head_number; ++j) { + cutlass::gemm::GemmCoord problem0(m, mkv, k0); + cutlass::gemm::GemmCoord problem1(m, k1, mkv); + problem_sizes0.push_back(problem0); + problem_sizes1.push_back(problem1); + + if (use_mask) { + cutlass::gemm::GemmCoord problem0_real(m_real, mkv_real, k0); + cutlass::gemm::GemmCoord problem1_real(m_real, k1, mkv_real); + problem_sizes0_real.push_back(problem0_real); + problem_sizes1_real.push_back(problem1_real); + } + } + } + } + + /// Prints the usage statement. 
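+  /// Invoked from main() when the --help flag is passed.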
+ std::ostream & print_usage(std::ostream &out) const { + + out << "41_fused_multi_head_attention_fixed_seqlen\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --head_number= Head number in multi-head attention (default: --head_number=12)\n" + << " --batch_size= Batch size in multi-head attention (default: --batch_size=16)\n" + << " --head_size= Head size in multi-head attention (default: --head_size=64)\n" + << " --head_size_v= Head size in multi-head attention for V (default: --head_size_v=head_size)\n" + << " --seq_length= Sequence length in multi-head attention for Q (default: --seq_length=1024)\n" + << " --seq_length_kv= Sequence length in multi-head attention for K/V (default: --seq_length_kv=seq_length)\n" + << " --use_mask= If true, performs padding-like masking in softmax.\n" + << " --iterations= Number of profiling iterations to perform.\n" + << " --reference-check= If true, performs reference check.\n" + << " --causal= If true, uses causal masking.\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fops = int64_t(); + + for (size_t i = 0; i < problem_sizes0.size(); ++i) { + auto const& problem0 = problem_sizes0[i]; + auto const& problem1 = problem_sizes1[i]; + for (int row = 0; row < problem0.m(); ++row) { + int num_cols0 = problem0.n(); + if (causal) { + num_cols0 = std::min(row + 1, num_cols0); + } + // P <- Q . K_t + fops += 2 * num_cols0 * problem0.k(); + // P <- exp(P - max(P)) + fops += 2 * num_cols0; + // S <- sum(P) + fops += num_cols0 - 1; + // O <- P . V + fops += 2 * num_cols0 * problem1.n(); + // O <- O / S + fops += num_cols0 * problem1.n(); + } + } + + return double(fops) / double(1.0e9) / runtime_s; + } +}; + + + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class TestbedAttention { +public: + + // + // Type definitions + // + + using ElementQ = typename Attention::scalar_t; + using ElementK = typename Attention::scalar_t; + using ElementP = typename Attention::accum_t; + using ElementAccumulator = typename Attention::accum_t; + using ElementV = typename Attention::scalar_t; + using ElementO = typename Attention::output_t; + + using ElementCompute = typename Attention::accum_t; + + using ElementNorm = typename Attention::accum_t; + using ElementSum = typename Attention::accum_t; + using ElementSoftmaxCompute = typename Attention::accum_t; + + using LayoutQ = cutlass::layout::RowMajor; + using LayoutK = cutlass::layout::ColumnMajor; + using LayoutP = cutlass::layout::RowMajor; + using LayoutV = cutlass::layout::RowMajor; + using LayoutO = cutlass::layout::RowMajor; + + using MatrixCoord = typename LayoutP::TensorCoord; + +private: + + // + // Data members + // + + Options & options; + + /// Initialization + cutlass::Distribution::Kind init_Q; + cutlass::Distribution::Kind init_K; + cutlass::Distribution::Kind init_P; + cutlass::Distribution::Kind init_V; + cutlass::Distribution::Kind init_O; + uint32_t seed; + + cutlass::DeviceAllocation problem_sizes_device0; + cutlass::DeviceAllocation problem_sizes_device1; + cutlass::DeviceAllocation problem_sizes_device0_real; + + std::vector offset_Q; + std::vector offset_K; + std::vector offset_P; + std::vector offset_V; + std::vector offset_O; + + std::vector ldq_host; + std::vector ldk_host; + std::vector ldp_host; + std::vector ldv_host; + std::vector ldo_host; + std::vector seqlen_host; + + 
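+  // Device-side copies of the per-problem leading dimensions and sequence lengths above,
+  // filled from the corresponding *_host vectors in initialize_().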
cutlass::DeviceAllocation ldq; + cutlass::DeviceAllocation ldk; + cutlass::DeviceAllocation ldp; + cutlass::DeviceAllocation ldv; + cutlass::DeviceAllocation ldo; + cutlass::DeviceAllocation seqlen; + + cutlass::DeviceAllocation block_Q; + cutlass::DeviceAllocation block_K; + cutlass::DeviceAllocation block_P; + cutlass::DeviceAllocation block_V; + cutlass::DeviceAllocation block_O; + cutlass::DeviceAllocation block_Norm; + cutlass::DeviceAllocation block_Sum; + + cutlass::DeviceAllocation offset_P_Device; + + cutlass::DeviceAllocation ptr_Q; + cutlass::DeviceAllocation ptr_K; + cutlass::DeviceAllocation ptr_P; + cutlass::DeviceAllocation ptr_V; + cutlass::DeviceAllocation ptr_O; + +public: + + // + // Methods + // + + TestbedAttention( + Options &options_, + cutlass::Distribution::Kind init_Q_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_K_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_P_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_V_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_O_ = cutlass::Distribution::Uniform, + uint32_t seed_ = 3080 + ): + options(options_), init_Q(init_Q_), init_K(init_K_), init_P(init_P_), init_V(init_V_), init_O(init_O_), seed(seed_) { } + + int problem_count() const { + return (options.head_number * options.batch_size); + } + +private: + + /// Helper to initialize a tensor view + template + void initialize_tensor_( + Element *ptr, + size_t capacity, + cutlass::Distribution::Kind dist_kind, + uint32_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else if (bits_output == 16) { + scope_max = 8; + scope_min = -8; + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::device::BlockFillRandomUniform( + ptr, capacity, seed, scope_max, scope_min, 0); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::device::BlockFillRandomGaussian( + ptr, capacity, seed, Element(), Element(0.5f)); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + // Fill with increasing elements + cutlass::reference::device::BlockFillSequential( + ptr, capacity, Element(1), Element()); + } + else { + + // Fill with all 1s + cutlass::reference::device::BlockFillSequential( + ptr, capacity, Element(), Element(1)); + } + } + + /// Initializes data structures + void initialize_() { + + // + // Set scalors for the mha example + // + + options.alpha0 = 1.0f / sqrt(float(options.head_size)); + options.alpha1 = 1.0f; + options.beta = 0; + + // + // Choose random problem sizes + // + + // construct a few problems of random sizes + srand(seed); + + int64_t total_elements_Q = 0; + int64_t total_elements_K = 0; + int64_t total_elements_P = 0; + int64_t total_elements_V = 0; + int64_t total_elements_O = 0; + + ldq_host.resize(problem_count()); + ldk_host.resize(problem_count()); + ldp_host.resize(problem_count()); + ldv_host.resize(problem_count()); + ldo_host.resize(problem_count()); + seqlen_host.resize(problem_count()); + + // Create tensors in BMHK format, where + // B = batch_size + // M = sequence length + // H = num_heads + // K = embedding size per head + int64_t batch_offset_Q, batch_offset_K, batch_offset_V, batch_offset_O; + + for (int32_t b = 0; b < 
options.batch_size; ++b) { + batch_offset_Q = total_elements_Q; + batch_offset_K = total_elements_K; + batch_offset_V = total_elements_V; + batch_offset_O = total_elements_O; + for (int32_t h = 0; h < options.head_number; ++h) { + int32_t i = h + b * options.head_number; + + auto problem0 = options.problem_sizes0.at(i); + auto problem1 = options.problem_sizes1.at(i); + + ldq_host.at(i) = LayoutQ::packed({problem0.m(), options.head_number * problem0.k()}).stride(0); + ldk_host.at(i) = LayoutK::packed({options.head_number * problem0.k(), problem0.n()}).stride(0); + ldp_host.at(i) = LayoutP::packed({problem0.m(), problem0.n()}).stride(0); + ldv_host.at(i) = LayoutV::packed({problem1.k(), options.head_number * problem1.n()}).stride(0); + ldo_host.at(i) = LayoutO::packed({problem1.m(), options.head_number * problem1.n()}).stride(0); + + // m = n for attention problems. + seqlen_host.at(i) = problem0.m(); + + offset_Q.push_back(batch_offset_Q + h * problem0.k()); + offset_K.push_back(batch_offset_K + h * problem0.k()); + offset_P.push_back(total_elements_P); + offset_V.push_back(batch_offset_V + h * problem0.k()); + offset_O.push_back(batch_offset_O + h * problem1.n()); + + int64_t elements_Q = problem0.m() * problem0.k(); + int64_t elements_K = problem0.k() * problem0.n(); + int64_t elements_P = problem0.m() * problem0.n(); + int64_t elements_V = problem1.k() * problem1.n(); + int64_t elements_O = problem1.m() * problem1.n(); + + total_elements_Q += elements_Q; + total_elements_K += elements_K; + total_elements_P += elements_P; + total_elements_V += elements_V; + total_elements_O += elements_O; + } + } + + problem_sizes_device0.reset(problem_count()); + problem_sizes_device1.reset(problem_count()); + problem_sizes_device0.copy_from_host(options.problem_sizes0.data()); + problem_sizes_device1.copy_from_host(options.problem_sizes1.data()); + + if (options.use_mask) { + problem_sizes_device0_real.reset(problem_count()); + problem_sizes_device0_real.copy_from_host(options.problem_sizes0_real.data()); + } + + ldq.reset(problem_count()); + ldk.reset(problem_count()); + ldp.reset(problem_count()); + ldv.reset(problem_count()); + ldo.reset(problem_count()); + seqlen.reset(problem_count()); + + ldq.copy_from_host(ldq_host.data()); + ldk.copy_from_host(ldk_host.data()); + ldp.copy_from_host(ldp_host.data()); + ldv.copy_from_host(ldv_host.data()); + ldo.copy_from_host(ldo_host.data()); + seqlen.copy_from_host(seqlen_host.data()); + + // + // Assign pointers + // + + block_Q.reset(total_elements_Q); + block_K.reset(total_elements_K); + block_P.reset(total_elements_P); + block_V.reset(total_elements_V); + block_O.reset(total_elements_O); + + offset_P_Device.reset(problem_count()); + + // sync offset with device + cutlass::device_memory::copy_to_device(offset_P_Device.get(), offset_P.data(), offset_P.size()); + + std::vector ptr_Q_host(problem_count()); + std::vector ptr_K_host(problem_count()); + std::vector ptr_P_host(problem_count()); + std::vector ptr_V_host(problem_count()); + std::vector ptr_O_host(problem_count()); + std::vector ptr_norm_host(problem_count()); + std::vector ptr_sum_host(problem_count()); + + for (int32_t i = 0; i < problem_count(); ++i) { + ptr_Q_host.at(i) = block_Q.get() + offset_Q.at(i); + ptr_K_host.at(i) = block_K.get() + offset_K.at(i); + ptr_P_host.at(i) = block_P.get() + offset_P.at(i); + ptr_V_host.at(i) = block_V.get() + offset_V.at(i); + ptr_O_host.at(i) = block_O.get() + offset_O.at(i); + } + + ptr_Q.reset(problem_count()); + ptr_Q.copy_from_host(ptr_Q_host.data()); + + 
ptr_K.reset(problem_count()); + ptr_K.copy_from_host(ptr_K_host.data()); + + ptr_P.reset(problem_count()); + ptr_P.copy_from_host(ptr_P_host.data()); + + ptr_V.reset(problem_count()); + ptr_V.copy_from_host(ptr_V_host.data()); + + ptr_O.reset(problem_count()); + ptr_O.copy_from_host(ptr_O_host.data()); + + // + // Initialize the problems of the workspace + // + + initialize_tensor_(block_Q.get(), total_elements_Q, init_Q, seed + 1); + initialize_tensor_(block_K.get(), total_elements_K, init_K, seed + 2); + initialize_tensor_(block_V.get(), total_elements_V, init_V, seed + 3); + + } + + template + bool verify_tensor_(std::vector vector_Input, \ + std::vector vector_Input_Ref, + int64_t verify_length = -1) { + + int64_t size = (vector_Input.size() < vector_Input_Ref.size()) ? vector_Input.size() : vector_Input_Ref.size(); + size = (verify_length == -1) ? size : verify_length; + + // 0.05 for absolute error + float abs_tol = 5e-2f; + // 10% for relative error + float rel_tol = 1e-1f; + for (int64_t i = 0; i < size; ++i) { + float diff = (float)(vector_Input.at(i) - vector_Input_Ref.at(i)); + float abs_diff = fabs(diff); + float abs_ref = fabs((float)vector_Input_Ref.at(i) + 1e-5f); + float relative_diff = abs_diff / abs_ref; + if ( (isnan(vector_Input_Ref.at(i)) || isnan(abs_diff) || isinf(abs_diff)) || (abs_diff > abs_tol && relative_diff > rel_tol)) { + printf("[%d/%d] diff = %f, rel_diff = %f, {computed=%f, ref=%f}.\n", int(i), int(size), abs_diff, relative_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i))); + return false; + } + + } + + return true; + } + + /// Verifies the result is a GEMM + bool verify_() { + + bool passed = true; + + for (int32_t b = 0; b < options.batch_size; ++b) { + int32_t i = b * options.head_number; + // Problem size is the same for all heads + cutlass::gemm::GemmCoord problem0 = options.problem_sizes0.at(b * options.head_number); + cutlass::gemm::GemmCoord problem1 = options.problem_sizes1.at(b * options.head_number); + + MatrixCoord extent_Q{problem0.m(), problem0.k()}; + MatrixCoord extent_K{problem0.k(), problem0.n()}; + MatrixCoord extent_P{problem0.m(), problem0.n()}; + MatrixCoord extent_V{problem1.k(), problem1.n()}; + MatrixCoord extent_O{problem1.m(), problem1.n()}; + + LayoutO layout_O(ldo_host.at(i)); + std::vector matrix_O(layout_O.capacity(extent_O)); + cutlass::device_memory::copy_to_host(matrix_O.data(), block_O.get() + offset_O.at(i), matrix_O.size()); + cutlass::DeviceAllocation block_Ref_O(layout_O.capacity(extent_O)); + + for (int32_t h = 0; h < options.head_number; ++h) { + i = h + b * options.head_number; + + LayoutQ layout_Q(ldq_host.at(i)); + LayoutK layout_K(ldk_host.at(i)); + LayoutP layout_P(ldp_host.at(i)); + LayoutV layout_V(ldv_host.at(i)); + + cutlass::TensorView view_Q(block_Q.get() + offset_Q.at(i), layout_Q, extent_Q); + cutlass::TensorView view_K(block_K.get() + offset_K.at(i), layout_K, extent_K); + cutlass::TensorView view_V(block_V.get() + offset_V.at(i), layout_V, extent_V); + cutlass::TensorView view_Ref_O_device(block_Ref_O.get() + offset_O.at(i) - offset_O.at(b * options.head_number), layout_O, extent_O); + + cutlass::DeviceAllocation block_Ref_P(layout_P.capacity(extent_P)); + cutlass::TensorView view_Ref_P_device(block_Ref_P.get(), layout_P, extent_P); + + // Reference GEMM + cutlass::reference::device::GemmComplex< + ElementQ, LayoutQ, + ElementK, LayoutK, + ElementP, LayoutP, + ElementCompute, ElementAccumulator + >( + problem0, + ElementAccumulator(options.alpha0), + view_Q, + 
Attention::MM0::Mma::kTransformA, + view_K, + Attention::MM0::Mma::kTransformB, + ElementAccumulator(options.beta), + view_Ref_P_device, + view_Ref_P_device, + ElementAccumulator(0) + ); + + // Compute softmax for P. We need to explicitly compute softmax + // over P because softmax is fused to the second GEMM in the + // profiled implementation. + std::vector matrix_Ref(layout_P.capacity(extent_P)); + cutlass::device_memory::copy_to_host(matrix_Ref.data(), block_Ref_P.get(), matrix_Ref.size()); + cutlass::TensorView view_Ref_host(matrix_Ref.data(), layout_P, extent_P); + std::vector vector_Norm_Ref(problem0.m()); + std::vector vector_Sum_Ref(problem0.m()); + + int n_dim = options.use_mask ? options.problem_sizes0_real.at(i).n() : problem0.n(); + + // Compute softmax for reference matrix + for (int m = 0; m < problem0.m(); m++) { + int n_dim_row = n_dim; + if (options.causal) { + n_dim_row = std::min(m + 1, n_dim); + } + ElementSoftmaxCompute max = ElementSoftmaxCompute(view_Ref_host.ref().at({m, 0})); + for (int n = 1; n < n_dim_row; n++) { + max = std::max(max, ElementSoftmaxCompute(view_Ref_host.ref().at({m, n}))); + } + + vector_Norm_Ref.at(m) = ElementNorm(max); + + ElementSoftmaxCompute sum = ElementSoftmaxCompute(); + for (int n = 0; n < n_dim_row; n++) { + sum += std::exp( ElementSoftmaxCompute(view_Ref_host.ref().at({m, n})) - max ); + } + ElementSoftmaxCompute inv_sum = ElementSoftmaxCompute(1.0f / sum); + + vector_Sum_Ref.at(m) = ElementSum(inv_sum); + + for (int n = 0; n < n_dim_row; n++) { + view_Ref_host.ref().at({m, n}) = ElementP( + std::exp( ElementSoftmaxCompute(view_Ref_host.ref().at({m, n})) - max ) * inv_sum + ); + } + // Mask out the rest of the attention matrix + for (int n = n_dim_row; n < n_dim; ++n) { + view_Ref_host.ref().at({m, n}) = ElementP(0); + } + } + + // when not using mask, problem_real and problem share the same sizes + if (options.use_mask) { + for (int m = 0; m < problem0.m(); m++) { + for (int n = n_dim; n < problem0.n(); n++) { + view_Ref_host.ref().at({m, n}) = ElementP(0); + } + } + } + + cutlass::device_memory::copy_to_device(block_Ref_P.get(), matrix_Ref.data(), matrix_Ref.size()); + + // Reference GEMM + cutlass::reference::device::GemmComplex< + ElementP, LayoutP, + ElementV, LayoutV, + ElementO, LayoutO, + ElementCompute, ElementAccumulator + >( + problem1, + ElementAccumulator(options.alpha1), + view_Ref_P_device, + Attention::MM0::Mma::kTransformA, + view_V, + Attention::MM0::Mma::kTransformB, + ElementAccumulator(options.beta), + view_Ref_O_device, + view_Ref_O_device, + ElementAccumulator(0) + ); + } + + // Copy to host memory + std::vector matrix_Ref_O(layout_O.capacity(extent_O)); + cutlass::device_memory::copy_to_host(matrix_Ref_O.data(), block_Ref_O.get(), matrix_Ref_O.size()); + + // printf("Pb %d: \n Q=(offset=%d, ldq=%d)\n K=(offset=%d, ldk=%d)\n O=(offset=%d, ldo=%d)\n", + // int(i), int(offset_Q[i]), int(ldq_host[i]), int(offset_K[i]), int(ldk_host[i]), int(offset_O[i]), int(ldo_host[i])); + + bool verified_O = false; + + if (!verified_O) { + verified_O = verify_tensor_(matrix_O, matrix_Ref_O); + } + + passed = passed && verified_O; + + if (!passed) { + std::cerr << "\n***\nError - problem " << i << " (batch " << b << ") failed the QA check\n***\n" << std::endl; + + if (!verified_O) { + std::cout << "Final matrix output is incorrect" << std::endl; + } + + return passed; + } + } + + return passed; + } + +public: + + + /// Executes a CUTLASS Attention kernel and measures runtime. 
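+  /// The reported runtime is the average over options.iterations launches timed with CUDA
+  /// events; `passed` reflects the optional reference check.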
+ Result profile() { + + Result result; + result.passed = false; + + // Initialize the problem + initialize_(); + + typename Attention::Params p; + { // set parameters + p.query_ptr = block_Q.get(); + p.key_ptr = block_K.get(); + p.value_ptr = block_V.get(); + p.logsumexp_ptr = nullptr; // Only needed for bw + p.output_accum_ptr = nullptr; + if (Attention::kNeedsOutputAccumulatorBuffer) { + cudaMalloc(&p.output_accum_ptr, block_O.size() * sizeof(typename Attention::output_accum_t)); + } + p.output_ptr = block_O.get(); + + // TODO: support arbitrary seq lengths + // if (cu_seqlens_q.has_value()) { + // p.cu_seqlens_q_ptr = (int32_t*)cu_seqlens_q->data_ptr(); + // p.cu_seqlens_k_ptr = (int32_t*)cu_seqlens_k->data_ptr(); + // } + + p.scale = options.alpha0; + + p.num_heads = options.head_number; + p.num_batches = options.batch_size; + p.head_dim = options.head_size; + p.head_dim_value = options.head_size_v; + p.num_queries = options.seq_length; + p.num_keys = options.seq_length_kv; + if (options.causal) { + p.custom_mask_type = Attention::CausalFromTopLeft; + } + + // All tensors are in BMHK shapes + p.q_strideH = options.head_size; + p.k_strideH = options.head_size; + p.v_strideH = options.head_size_v; + p.q_strideM = int32_t(ldq_host[0]); + p.k_strideM = int32_t(ldk_host[0]); + p.v_strideM = int32_t(ldv_host[0]); + p.q_strideB = p.q_strideM * options.seq_length; + p.k_strideB = p.k_strideM * options.seq_length_kv; + p.v_strideB = p.v_strideM * options.seq_length_kv; + p.o_strideM = p.head_dim_value * p.num_heads; + } + + // launch kernel :) + constexpr auto kernel_fn = attention_kernel_batched_impl; + int smem_bytes = sizeof(typename Attention::SharedStorage); + if (smem_bytes > 0xc000) { + cudaFuncSetAttribute(kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes); + } + if (!Attention::check_supported(p)) { + std::cerr << "Kernel does not support these inputs" << std::endl; + return result; + } + kernel_fn<<>>(p); + + // Wait for completion + result.error = cudaDeviceSynchronize(); + + if (result.error != cudaSuccess) { + std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error); + return result; + } + + // + // Verify correctness + // + result.passed = true; + + if (options.reference_check) { + result.passed = verify_(); + } + + // + // Warm-up run + // + + kernel_fn<<>>(p); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to run CUTLASS Attention kernel." << std::endl; + return result; + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < options.iterations; ++iter) { + kernel_fn<<>>(p); + } + + // + // Stop profiling loop + // + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. 
+ result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // + // Cleanup + // + + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + std::cout << std::endl; + std::cout << "CUTLASS Attention:\n" + << "====================================================" << std::endl; + std::cout << " " << " {seq length Q, seq length KV, head size, head size V, head number, batch size} = {" << options.seq_length \ + << ", " << options.seq_length_kv << ", " << options.head_size << ", " << options.head_size_v << ", " << options.head_number\ + << ", " << options.batch_size << "}." << std::endl; + std::cout << std::endl; + std::cout << " " << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " " << "GFLOPs: " << result.gflops << std::endl; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int kQueriesPerBlock, + int kKeysPerBlock, + int kMaxK +> +int run_attention(Options& options) { + using Attention = AttentionKernel< + cutlass::half_t, // scalar_t + cutlass::arch::Sm80, // ArchTag + true, // Memory is aligned + kQueriesPerBlock, + kKeysPerBlock, + kMaxK, + false, // Supports dropout + false // Supports bias + >; + + // + // Test and profile + // + + TestbedAttention testbed(options); + + Result result = testbed.profile(); + if (!result.passed) { + std::cout << "Profiling CUTLASS attention has failed.\n"; + std::cout << "\nFailed\n"; + return -1; + } + + std::cout << "\nPassed\n"; + return 0; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (__CUDACC_VER_MAJOR__ < 11 || props.major < 8) { + + // + // This example requires an NVIDIA Ampere-architecture GPU. + // + + std::cout + << "CUTLASS's CUTLASS Attention example requires a GPU of NVIDIA's Ampere Architecture or " + << "later (compute capability 80 or greater).\n"; + + return 0; + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + if (options.use_mask) { + std::cerr << "--use_mask is not supported at the moment\n"; + return -2; + } + if (options.alignment != 1) { + std::cerr << "--alignment=1 is the only supported value\n"; + return -2; + } + + // Determine kernel configuration based on head size. 
+ // If head size is less than or equal to 64, each block operates over 64 queries and + // 64 keys, and partial results can be stored in the register file. + // If head size is greater than 64, each block operates over 32 queries and 128 keys, + // and partial results are stored in shared memory. + if (options.head_size_v > 64) { + static int const kQueriesPerBlock = 32; + static int const kKeysPerBlock = 128; + if (options.head_size_v <= 128) { + return run_attention(options); + } else { + return run_attention(options); + } + } else { + static constexpr int kMaxK = 64; // <- Decrease to 32/16 if your problem is smaller + static int const kQueriesPerBlock = 64; + static int const kKeysPerBlock = 64; + return run_attention(options); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu b/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu new file mode 100644 index 0000000000..49d8699a64 --- /dev/null +++ b/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu @@ -0,0 +1,1195 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief CUTLASS Attention Example. + + This workload computes a fused multi head attention that supports variable sequence lengths. + Because it keeps the attention matrix in shared memory, it's both faster and + uses less global memory. + + This is based on `"Self-Attention Does Not Need O(n^2) Memory" `_, + and very similar to `"FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness" `_. 
+
+    Algorithm:
+      In short, we can compute the output incrementally in blocks of size B;
+      we only need to divide the final result by the sum of all coefficients in
+      the softmax (which we compute incrementally), as in the following pseudo-code:
+
+      ```
+      s_prime = torch.zeros([num_queries])
+      O = torch.zeros([num_queries, head_size_v])
+      for i in range(0, K.shape[0], B):
+        si = exp((Q . K[i * B:(i+1) * B].t) * scale)
+        s_prime += si.sum(-1)
+        O += si . V[i * B:(i+1) * B]
+      O = O / s_prime
+      ```
+
+      In practice, and for numerical stability reasons,
+      we also subtract the maximum so far (`mi`) before doing
+      the exponential. When we encounter new keys, the maximum
+      used to compute O so far (`m_prime`) can differ from the
+      current maximum, so we update O before accumulating with
+
+      ```
+      O = O * exp(m_prime - mi)
+      m_prime = mi
+      ```
+
+    Implementation details:
+      - `si` is stored in shared memory between the two back-to-back GEMMs
+      - we keep and accumulate the output
+        directly in registers if we can (`head_size_v <= 128`).
+        Otherwise, we store and accumulate it in global memory (slower)
+      - blocks are parallelized across the batch dimension, the number
+        of heads, and the query sequence size
+
+    Examples:
+
+      # Run an attention example with default setup
+      $ ./examples/41_fused_multi_head_attention/41_fused_multi_head_attention_variable_seqlen
+
+      # Run an attention example with custom setup
+      $ ./examples/41_fused_multi_head_attention/41_fused_multi_head_attention_variable_seqlen --head_number=2 --batch_size=3 --head_size=32 --head_size_v=64 --seq_length=512 --seq_length_kv=1024 --causal=true
+
+    Acknowledgement: Fixed-sequence-length FMHA code was upstreamed by Meta xFormers (https://github.com/facebookresearch/xformers).
+    Using grouped GEMM to handle variable sequence lengths is inspired by an idea originally prototyped by ByteDance Inc.
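+
+    To make the grouped-GEMM mapping concrete, the following is a condensed sketch of how
+    one (Q . K^T) problem and one (P . V) problem are created per (batch, head) pair when
+    sequence lengths vary (a simplification of Options::randomize_problems() in this file;
+    masking and the fixed-length path are omitted, and the helper name is illustrative):
+
+    ```
+    #include <cstdlib>
+    #include <vector>
+    #include "cutlass/gemm_coord.h"
+
+    void build_problem_sizes(int batch_size, int head_number,
+                             int max_seq_length, int max_seq_length_kv,
+                             int head_size, int head_size_v, int alignment,
+                             std::vector<cutlass::gemm::GemmCoord>& problem_sizes0,
+                             std::vector<cutlass::gemm::GemmCoord>& problem_sizes1) {
+      for (int b = 0; b < batch_size; ++b) {
+        // Each batch entry draws its own sequence lengths and rounds them up to the alignment.
+        int m   = ((std::rand() % max_seq_length)    + 1 + alignment - 1) / alignment * alignment;
+        int mkv = ((std::rand() % max_seq_length_kv) + 1 + alignment - 1) / alignment * alignment;
+        // All heads of a batch entry share the same sequence lengths.
+        for (int h = 0; h < head_number; ++h) {
+          problem_sizes0.emplace_back(m, mkv, head_size);    // P = Q . K^T : (m x mkv) x k0
+          problem_sizes1.emplace_back(m, head_size_v, mkv);  // O = P . V   : (m x k1)  x mkv
+        }
+      }
+    }
+    ```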
+*/ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/device/gemm_grouped.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm_complex.h" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_norm.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/gemm/kernel/default_gemm.h" +#include "cutlass/gemm/kernel/default_gemm_complex.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/fast_math.h" + +#include "default_fmha_grouped.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + bool reference_check; + bool use_mask; + bool causal; + bool fixed_seq_length; + + std::vector problem_sizes0; + std::vector problem_sizes1; + + std::vector problem_sizes0_real; + std::vector problem_sizes1_real; + + int alignment; + int head_number; + int batch_size; + int head_size; + int head_size_v; + int seq_length; + int seq_length_kv; + int iterations; + int problem_count; + + // alpha0, alpha1 and beta are fixed + // in this multi-head attention example + float alpha0; + float alpha1; + float beta; + + cutlass::gemm::kernel::GroupScheduleMode scheduler_mode; + + // + // Methods + // + + Options(): + help(false), + error(false), + alignment(1), + reference_check(true), + head_number(12), + batch_size(16), + head_size(64), + head_size_v(64), + seq_length(1024), + seq_length_kv(1024), + use_mask(false), + iterations(20), + causal(false), + fixed_seq_length(false), + problem_count(batch_size * head_number), + scheduler_mode(cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("alignment", alignment, 1); + cmd.get_cmd_line_argument("head_number", head_number, 12); + cmd.get_cmd_line_argument("batch_size", batch_size, 16); + cmd.get_cmd_line_argument("head_size", head_size, 64); + cmd.get_cmd_line_argument("head_size_v", head_size_v, head_size); + cmd.get_cmd_line_argument("seq_length", seq_length, 1024); + cmd.get_cmd_line_argument("seq_length_kv", seq_length_kv, seq_length); + cmd.get_cmd_line_argument("use_mask", use_mask, false); + cmd.get_cmd_line_argument("iterations", iterations, 20); + 
cmd.get_cmd_line_argument("reference-check", reference_check, true); + cmd.get_cmd_line_argument("causal", causal, true); + cmd.get_cmd_line_argument("fixed_seq_length", fixed_seq_length, false); + + std::vector scheduler_mode_strs; + cmd.get_cmd_line_arguments("scheduler-mode", scheduler_mode_strs); + + if (!scheduler_mode_strs.empty()) { + if (scheduler_mode_strs.size() > 1) { + std::cerr << "Only one scheduler mode may be passed in" << std::endl; + error = true; + return; + } + std::string scheduler_mode_str = scheduler_mode_strs[0]; + if (scheduler_mode_str == "kDeviceOnly") { + scheduler_mode = cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly; + } else if (scheduler_mode_str == "kHostPrecompute") { + scheduler_mode = cutlass::gemm::kernel::GroupScheduleMode::kHostPrecompute; + } else { + std::cerr << "Unrecognized scheduler mode '" << scheduler_mode_str << "'" << std::endl; + error = true; + return; + } + } + + if (fixed_seq_length) { + std::cout << "NOTE: Better performance is expected for fixed-sized sequence length from 41_fused_multi_head_attention_fixed_seqlen." << std::endl; + } + + randomize_problems(); + } + + void randomize_problems() { + + problem_count = head_number * batch_size; + + problem_sizes0.reserve(problem_count); + problem_sizes1.reserve(problem_count); + + // When using mask, the original inputs are not padded + // and we need to save these info. + if (use_mask) { + problem_sizes0_real.reserve(problem_count); + problem_sizes1_real.reserve(problem_count); + } + + for (int i = 0; i < batch_size; ++i) { + // problems belonging to the same batch share the same seq len + + int m_real, mkv_real; + if (fixed_seq_length) { + m_real = seq_length; + mkv_real = seq_length_kv; + } else { + m_real = (rand() % seq_length) + 1; + + // Only randomize seq_length_kv if it was set to a different value than + // seq_length originally. + if (seq_length != seq_length_kv) { + mkv_real = (rand() % seq_length_kv) + 1; + } else { + mkv_real = m_real; + } + } + + int m = (m_real + alignment - 1) / alignment * alignment; + int mkv = (mkv_real + alignment - 1) / alignment * alignment; + int k0 = head_size; + int k1 = head_size_v; + + for (int j = 0; j < head_number; ++j) { + cutlass::gemm::GemmCoord problem0(m, mkv, k0); + cutlass::gemm::GemmCoord problem1(m, k1, mkv); + + problem_sizes0.push_back(problem0); + problem_sizes1.push_back(problem1); + + if (use_mask) { + cutlass::gemm::GemmCoord problem0_real(m_real, mkv_real, k0); + cutlass::gemm::GemmCoord problem1_real(m_real, k1, mkv_real); + problem_sizes0_real.push_back(problem0_real); + problem_sizes1_real.push_back(problem1_real); + } + + } + } + } + + void print_problems() { + std::cout << " Running " << batch_size << " batches, each with " << head_number << " heads of size " << head_size << ":" << std::endl; + for (int i = 0; i < batch_size; ++i) { + int idx = i * head_number; + std::cout << " [" << i << "] seq_length = " << problem_sizes0[idx].m() << " seq_length_kv = " << problem_sizes0[idx].n() << std::endl; + } + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "41_fused_multi_head_attention_variable_seqlen\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --head_number= Head number in multi-head attention (default: --head_number=12)\n" + << " --batch_size= Batch size in multi-head attention (default: --batch_size=16)\n" + << " --head_size= Head size in multi-head attention (default: --head_size=64)\n" + << " --head_size_v= Head size in multi-head attention for V (default: --head_size_v=head_size)\n" + << " --seq_length= Sequence length in multi-head attention for Q (default: --seq_length=1024)\n" + << " --seq_length_kv= Sequence length in multi-head attention for K/V (default: --seq_length_kv=seq_length)\n" + << " --use_mask= If true, performs padding-like masking in softmax.\n" + << " --iterations= Number of profiling iterations to perform.\n" + << " --reference-check= If true, performs reference check.\n" + << " --causal= If true, uses causal masking.\n" + << " --fixed_seq_length= If true, uses the same sequence length for each item in the batch.\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fops = int64_t(); + + for (size_t i = 0; i < problem_sizes0.size(); ++i) { + auto const& problem0 = problem_sizes0[i]; + auto const& problem1 = problem_sizes1[i]; + + for (int row = 0; row < problem0.m(); ++row) { + int num_cols0 = problem0.n(); + if (causal) { + num_cols0 = std::min(row + 1, num_cols0); + } + // P <- Q . K_t + fops += 2 * num_cols0 * problem0.k(); + // P <- exp(P - max(P)) + fops += 2 * num_cols0; + // S <- sum(P) + fops += num_cols0 - 1; + // O <- P . V + fops += 2 * num_cols0 * problem1.n(); + // O <- O / S + fops += num_cols0 * problem1.n(); + } + } + + return double(fops) / double(1.0e9) / runtime_s; + } +}; + + + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class TestbedAttention { +public: + + // + // Type definitions + // + + using scalar_t = typename Attention::GemmKernel::scalar_t; + using accum_t = typename Attention::GemmKernel::accum_t; + using output_t = typename Attention::GemmKernel::output_t; + using output_accum_t = typename Attention::GemmKernel::output_accum_t; + + using ElementQ = scalar_t; + using ElementK = scalar_t; + using ElementP = accum_t; + using ElementAccumulator = accum_t; + using ElementV = scalar_t; + using ElementO = output_t; + using ElementOAccum = output_accum_t; + + using ElementCompute = accum_t; + + using ElementNorm = accum_t; + using ElementSum = accum_t; + using ElementSoftmaxCompute = accum_t; + + using LayoutQ = cutlass::layout::RowMajor; + using LayoutK = cutlass::layout::ColumnMajor; + using LayoutP = cutlass::layout::RowMajor; + using LayoutV = cutlass::layout::RowMajor; + using LayoutO = cutlass::layout::RowMajor; + + using MatrixCoord = typename LayoutP::TensorCoord; + + static bool const kNeedsOutputAccumulatorBuffer = Attention::GemmKernel::kNeedsOutputAccumulatorBuffer; + +private: + + // + // Data members + // + + Options & options; + + /// Initialization + cutlass::Distribution::Kind init_Q; + cutlass::Distribution::Kind init_K; + cutlass::Distribution::Kind init_P; + cutlass::Distribution::Kind init_V; + cutlass::Distribution::Kind init_O; + uint32_t seed; + + cutlass::DeviceAllocation problem_sizes_device0; + cutlass::DeviceAllocation problem_sizes_device1; + cutlass::DeviceAllocation 
problem_sizes_device0_real; + + std::vector offset_Q; + std::vector offset_K; + std::vector offset_P; + std::vector offset_V; + std::vector offset_O; + + std::vector ldq_host; + std::vector ldk_host; + std::vector ldp_host; + std::vector ldv_host; + std::vector ldo_host; + std::vector seqlen_host; + + cutlass::DeviceAllocation ldq; + cutlass::DeviceAllocation ldk; + cutlass::DeviceAllocation ldp; + cutlass::DeviceAllocation ldv; + cutlass::DeviceAllocation ldo; + cutlass::DeviceAllocation seqlen; + + cutlass::DeviceAllocation block_Q; + cutlass::DeviceAllocation block_K; + cutlass::DeviceAllocation block_P; + cutlass::DeviceAllocation block_V; + cutlass::DeviceAllocation block_O; + cutlass::DeviceAllocation block_O_accumulate; + cutlass::DeviceAllocation block_Norm; + cutlass::DeviceAllocation block_Sum; + + cutlass::DeviceAllocation offset_P_Device; + + cutlass::DeviceAllocation ptr_Q; + cutlass::DeviceAllocation ptr_K; + cutlass::DeviceAllocation ptr_P; + cutlass::DeviceAllocation ptr_V; + cutlass::DeviceAllocation ptr_O; + cutlass::DeviceAllocation ptr_O_accumulate; + + +public: + + // + // Methods + // + + TestbedAttention( + Options &options_, + cutlass::Distribution::Kind init_Q_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_K_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_P_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_V_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_O_ = cutlass::Distribution::Uniform, + uint32_t seed_ = 3080 + ): + options(options_), init_Q(init_Q_), init_K(init_K_), init_P(init_P_), init_V(init_V_), init_O(init_O_), seed(seed_) { } + + int problem_count() const { + return (options.head_number * options.batch_size); + } + +private: + + /// Helper to initialize a tensor view + template + void initialize_tensor_( + Element *ptr, + size_t capacity, + cutlass::Distribution::Kind dist_kind, + uint32_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else if (bits_output == 16) { + scope_max = 8; + scope_min = -8; + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::device::BlockFillRandomUniform( + ptr, capacity, seed, scope_max, scope_min, 0); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::device::BlockFillRandomGaussian( + ptr, capacity, seed, Element(), Element(0.5f)); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + // Fill with increasing elements + cutlass::reference::device::BlockFillSequential( + ptr, capacity, Element(1), Element()); + } + else { + + // Fill with all 1s + cutlass::reference::device::BlockFillSequential( + ptr, capacity, Element(), Element(1)); + } + } + + /// Initializes data structures + void initialize_() { + + // + // Set scalors for the mha example + // + + options.alpha0 = 1.0f / sqrt(float(options.head_size)); + options.alpha1 = 1.0f; + options.beta = 0; + + // + // Choose random problem sizes + // + + // construct a few problems of random sizes + srand(seed); + + int64_t total_elements_Q = 0; + int64_t total_elements_K = 0; + int64_t total_elements_P = 0; + int64_t total_elements_V = 0; + int64_t total_elements_O = 0; + + ldq_host.resize(problem_count()); + ldk_host.resize(problem_count()); + 
ldp_host.resize(problem_count()); + ldv_host.resize(problem_count()); + ldo_host.resize(problem_count()); + seqlen_host.resize(problem_count()); + + for (int32_t i = 0; i < problem_count(); ++i) { + + auto problem0 = options.problem_sizes0.at(i); + auto problem1 = options.problem_sizes1.at(i); + + ldq_host.at(i) = LayoutQ::packed({problem0.m(), problem0.k()}).stride(0); + ldk_host.at(i) = LayoutK::packed({problem0.k(), problem0.n()}).stride(0); + ldp_host.at(i) = LayoutP::packed({problem0.m(), problem0.n()}).stride(0); + ldv_host.at(i) = LayoutV::packed({problem1.k(), problem1.n()}).stride(0); + ldo_host.at(i) = LayoutO::packed({problem1.m(), problem1.n()}).stride(0); + + // m = n for attention problems. + seqlen_host.at(i) = problem0.m(); + + offset_Q.push_back(total_elements_Q); + offset_K.push_back(total_elements_K); + offset_P.push_back(total_elements_P); + offset_V.push_back(total_elements_V); + offset_O.push_back(total_elements_O); + + int64_t elements_Q = problem0.m() * problem0.k(); + int64_t elements_K = problem0.k() * problem0.n(); + int64_t elements_P = problem0.m() * problem0.n(); + int64_t elements_V = problem1.k() * problem1.n(); + int64_t elements_O = problem1.m() * problem1.n(); + + total_elements_Q += elements_Q; + total_elements_K += elements_K; + total_elements_P += elements_P; + total_elements_V += elements_V; + total_elements_O += elements_O; + + } + + problem_sizes_device0.reset(problem_count()); + problem_sizes_device1.reset(problem_count()); + problem_sizes_device0.copy_from_host(options.problem_sizes0.data()); + problem_sizes_device1.copy_from_host(options.problem_sizes1.data()); + + if (options.use_mask) { + problem_sizes_device0_real.reset(problem_count()); + problem_sizes_device0_real.copy_from_host(options.problem_sizes0_real.data()); + } + + ldq.reset(problem_count()); + ldk.reset(problem_count()); + ldp.reset(problem_count()); + ldv.reset(problem_count()); + ldo.reset(problem_count()); + seqlen.reset(problem_count()); + + ldq.copy_from_host(ldq_host.data()); + ldk.copy_from_host(ldk_host.data()); + ldp.copy_from_host(ldp_host.data()); + ldv.copy_from_host(ldv_host.data()); + ldo.copy_from_host(ldo_host.data()); + seqlen.copy_from_host(seqlen_host.data()); + + // + // Assign pointers + // + + block_Q.reset(total_elements_Q); + block_K.reset(total_elements_K); + block_P.reset(total_elements_P); + block_V.reset(total_elements_V); + block_O.reset(total_elements_O); + + if (kNeedsOutputAccumulatorBuffer) { + block_O_accumulate.reset(total_elements_O); + } + + offset_P_Device.reset(problem_count()); + + // sync offset with device + cutlass::device_memory::copy_to_device(offset_P_Device.get(), offset_P.data(), offset_P.size()); + + std::vector ptr_Q_host(problem_count()); + std::vector ptr_K_host(problem_count()); + std::vector ptr_P_host(problem_count()); + std::vector ptr_V_host(problem_count()); + std::vector ptr_O_host(problem_count()); + std::vector ptr_O_accumulate_host(problem_count()); + std::vector ptr_norm_host(problem_count()); + std::vector ptr_sum_host(problem_count()); + + for (int32_t i = 0; i < problem_count(); ++i) { + ptr_Q_host.at(i) = block_Q.get() + offset_Q.at(i); + ptr_K_host.at(i) = block_K.get() + offset_K.at(i); + ptr_P_host.at(i) = block_P.get() + offset_P.at(i); + ptr_V_host.at(i) = block_V.get() + offset_V.at(i); + ptr_O_host.at(i) = block_O.get() + offset_O.at(i); + + if (kNeedsOutputAccumulatorBuffer) { + ptr_O_accumulate_host.at(i) = block_O_accumulate.get() + offset_O.at(i); + } + } + + ptr_Q.reset(problem_count()); + 
ptr_Q.copy_from_host(ptr_Q_host.data()); + + ptr_K.reset(problem_count()); + ptr_K.copy_from_host(ptr_K_host.data()); + + ptr_P.reset(problem_count()); + ptr_P.copy_from_host(ptr_P_host.data()); + + ptr_V.reset(problem_count()); + ptr_V.copy_from_host(ptr_V_host.data()); + + ptr_O.reset(problem_count()); + ptr_O.copy_from_host(ptr_O_host.data()); + + if (kNeedsOutputAccumulatorBuffer) { + ptr_O_accumulate.reset(problem_count()); + ptr_O_accumulate.copy_from_host(ptr_O_accumulate_host.data()); + } + + // + // Initialize the problems of the workspace + // + + initialize_tensor_(block_Q.get(), total_elements_Q, init_Q, seed + 1); + initialize_tensor_(block_K.get(), total_elements_K, init_K, seed + 2); + initialize_tensor_(block_V.get(), total_elements_V, init_V, seed + 3); + + } + + template + bool verify_tensor_(std::vector vector_Input, \ + std::vector vector_Input_Ref, + int64_t verify_length = -1) { + + int64_t size = (vector_Input.size() < vector_Input_Ref.size()) ? vector_Input.size() : vector_Input_Ref.size(); + size = (verify_length == -1) ? size : verify_length; + + // 0.05 for absolute error + float abs_tol = 5e-2f; + // 10% for relative error + float rel_tol = 1e-1f; + for (int64_t i = 0; i < size; ++i) { + float diff = (float)(vector_Input.at(i) - vector_Input_Ref.at(i)); + float abs_diff = fabs(diff); + float abs_ref = fabs((float)vector_Input_Ref.at(i) + 1e-5f); + float relative_diff = abs_diff / abs_ref; + if ( (isnan(abs_diff) || isinf(abs_diff)) || (abs_diff > abs_tol && relative_diff > rel_tol)) { + printf("[%d/%d] diff = %f, rel_diff = %f, {computed=%f, ref=%f}.\n", int(i), int(size), abs_diff, relative_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i))); + return false; + } + + } + + return true; + } + + /// Verifies the result is a GEMM + bool verify_() { + + bool passed = true; + + for (int32_t i = 0; i < problem_count(); ++i) { + cutlass::gemm::GemmCoord problem0 = options.problem_sizes0.at(i); + cutlass::gemm::GemmCoord problem1 = options.problem_sizes1.at(i); + + LayoutQ layout_Q(ldq_host.at(i)); + LayoutK layout_K(ldk_host.at(i)); + LayoutP layout_P(ldp_host.at(i)); + LayoutV layout_V(ldv_host.at(i)); + LayoutO layout_O(ldo_host.at(i)); + + MatrixCoord extent_Q{problem0.m(), problem0.k()}; + MatrixCoord extent_K{problem0.k(), problem0.n()}; + MatrixCoord extent_P{problem0.m(), problem0.n()}; + MatrixCoord extent_V{problem1.k(), problem1.n()}; + MatrixCoord extent_O{problem1.m(), problem1.n()}; + + cutlass::TensorView view_Q(block_Q.get() + offset_Q.at(i), layout_Q, extent_Q); + cutlass::TensorView view_K(block_K.get() + offset_K.at(i), layout_K, extent_K); + cutlass::TensorView view_P(block_P.get() + offset_P.at(i), layout_P, extent_P); + cutlass::TensorView view_V(block_V.get() + offset_V.at(i), layout_V, extent_V); + + cutlass::DeviceAllocation block_Ref(layout_P.capacity(extent_P)); + cutlass::TensorView view_Ref_device(block_Ref.get(), layout_P, extent_P); + + cutlass::DeviceAllocation block_Ref_O(layout_O.capacity(extent_O)); + cutlass::TensorView view_Ref_O_device(block_Ref_O.get(), layout_O, extent_O); + cutlass::reference::device::TensorFill(view_Ref_O_device, ElementO(0)); + + // Reference GEMM + cutlass::reference::device::GemmComplex< + ElementQ, LayoutQ, + ElementK, LayoutK, + ElementP, LayoutP, + ElementCompute, ElementAccumulator + >( + problem0, + ElementAccumulator(options.alpha0), + view_Q, + Attention::GemmKernel::MM0::Mma::kTransformA, + view_K, + Attention::GemmKernel::MM0::Mma::kTransformB, + ElementAccumulator(options.beta), + 
view_P, + view_Ref_device, + ElementAccumulator(0) + ); + + // Compute softmax for P. We need to explicitly compute softmax + // over P because softmax is fused to the second GEMM in the + // profiled implementation. + std::vector matrix_Ref(layout_P.capacity(extent_P)); + cutlass::device_memory::copy_to_host(matrix_Ref.data(), block_Ref.get(), matrix_Ref.size()); + cutlass::TensorView view_Ref_host(matrix_Ref.data(), layout_P, extent_P); + std::vector vector_Norm_Ref(problem0.m()); + std::vector vector_Sum_Ref(problem0.m()); + + int n_dim = options.use_mask ? options.problem_sizes0_real.at(i).n() : problem0.n(); + + // Compute softmax for reference matrix + for (int m = 0; m < problem0.m(); m++) { + int n_dim_row = n_dim; + if (options.causal) { + n_dim_row = std::min(m + 1, n_dim); + } + ElementSoftmaxCompute max = ElementSoftmaxCompute(view_Ref_host.ref().at({m, 0})); + for (int n = 1; n < n_dim_row; n++) { + max = std::max(max, ElementSoftmaxCompute(view_Ref_host.ref().at({m, n}))); + } + + vector_Norm_Ref.at(m) = ElementNorm(max); + + ElementSoftmaxCompute sum = ElementSoftmaxCompute(); + for (int n = 0; n < n_dim_row; n++) { + sum += std::exp( ElementSoftmaxCompute(view_Ref_host.ref().at({m, n})) - max ); + } + ElementSoftmaxCompute inv_sum = ElementSoftmaxCompute(1.0f / sum); + + vector_Sum_Ref.at(m) = ElementSum(inv_sum); + + for (int n = 0; n < n_dim_row; n++) { + view_Ref_host.ref().at({m, n}) = ElementP( + std::exp( ElementSoftmaxCompute(view_Ref_host.ref().at({m, n})) - max ) * inv_sum + ); + } + // Mask out the rest of the attention matrix + for (int n = n_dim_row; n < n_dim; ++n) { + view_Ref_host.ref().at({m, n}) = ElementP(0); + } + + } + + // when not using mask, problem_real and problem share the same sizes + if (options.use_mask) { + for (int m = 0; m < problem0.m(); m++) { + for (int n = n_dim; n < problem0.n(); n++) { + view_Ref_host.ref().at({m, n}) = ElementP(0); + } + } + } + + cutlass::device_memory::copy_to_device(block_P.get() + offset_P.at(i), matrix_Ref.data(), matrix_Ref.size()); + + // Reference GEMM + cutlass::reference::device::GemmComplex< + ElementP, LayoutP, + ElementV, LayoutV, + ElementO, LayoutO, + ElementCompute, ElementAccumulator + >( + problem1, + ElementAccumulator(options.alpha1), + view_P, + Attention::GemmKernel::MM0::Mma::kTransformA, + view_V, + Attention::GemmKernel::MM0::Mma::kTransformB, + ElementAccumulator(options.beta), + view_Ref_O_device, + view_Ref_O_device, + ElementAccumulator(0) + ); + + // Copy to host memory + cutlass::TensorView view_Ref(matrix_Ref.data(), layout_P, extent_P); + + std::vector matrix_O(layout_O.capacity(extent_O)); + cutlass::device_memory::copy_to_host(matrix_O.data(), block_O.get() + offset_O.at(i), matrix_O.size()); + std::vector matrix_Ref_O(layout_O.capacity(extent_O)); + cutlass::device_memory::copy_to_host(matrix_Ref_O.data(), block_Ref_O.get(), matrix_Ref_O.size()); + + + bool verified_O = false; + if (!verified_O) { + verified_O = verify_tensor_(matrix_O, matrix_Ref_O); + } + + passed = passed && verified_O; + + if (!passed) { + std::cerr << "\n***\nError - problem " << i << " failed the QA check\n***\n" << std::endl; + + if (!verified_O) { + std::cout << "Final matrix output is incorrect" << std::endl; + } + + return passed; + } + + } + + return passed; + } + +public: + + + /// Executes a CUTLASS Attention kernel and measures runtime. 
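+  /// Initializes the grouped problem, runs the kernel once (verifying it against
+  /// a reference GEMM + softmax + GEMM when options.reference_check is set), and
+  /// then times options.iterations back-to-back launches with CUDA events to
+  /// report the average runtime and achieved GFLOPs.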
+ Result profile() { + + Result result; + result.passed = false; + + int threadblock_count = Attention::sufficient(options.problem_sizes1.data(), options.problem_count); + + // Early exit + if (!threadblock_count) { + std::cout << "Active CUDA device lacks hardware resources to run CUTLASS Grouped FMHA kernel." << std::endl; + return result; + } + + result.passed = false; + + // Initialize the problem + initialize_(); + + typename Attention::Arguments args( + problem_sizes_device0.get(), + problem_sizes_device1.get(), + options.problem_count, + threadblock_count, + ptr_Q.get(), + ptr_K.get(), + ptr_P.get(), + ptr_V.get(), + ptr_O.get(), + ptr_O_accumulate.get(), + ldq.get(), + ldk.get(), + ldp.get(), + ldv.get(), + ldo.get(), + options.causal, + options.alpha0, + options.problem_sizes1.data() + ); + + Attention fmha; + + size_t workspace_size = fmha.get_workspace_size(args); + cutlass::DeviceAllocation workspace(workspace_size); + + result.status = fmha.initialize(args, workspace.get()); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to initialize CUTLASS Grouped FMHA kernel." << std::endl; + return result; + } + + // Run the grouped FMHA object + result.status = fmha.run(); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to run CUTLASS Grouped FMHA kernel." << std::endl; + return result; + } + + // Wait for completion + result.error = cudaDeviceSynchronize(); + + if (result.error != cudaSuccess) { + std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Verify correctness + // + result.passed = true; + + if (options.reference_check) { + result.passed = verify_(); + } + + // + // Warm-up run of the grouped FMHA object + // + result.status = fmha.run(); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to run CUTLASS Grouped FMHA kernel." << std::endl; + return result; + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of FMHA operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < this->options.iterations; ++iter) { + fmha(); + } + + // + // Stop profiling loop + // + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. 
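+    // The elapsed time spans options.iterations launches, so dividing by the
+    // iteration count gives the per-launch average; options.gflops() converts
+    // that average (in seconds) into achieved GFLOPs.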
+ result.runtime_ms = double(runtime_ms) / double(this->options.iterations); + result.gflops = this->options.gflops(result.runtime_ms / 1000.0); + + // + // Cleanup + // + + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + std::cout << std::endl; + std::cout << "CUTLASS Attention:\n" + << "====================================================" << std::endl; + std::cout << " " << " {seq length Q, seq length KV, head size, head size V, head number, batch size} = {" << options.seq_length \ + << ", " << options.seq_length_kv << ", " << options.head_size << ", " << options.head_size_v << ", " << options.head_number\ + << ", " << options.batch_size << "}." << std::endl; + options.print_problems(); + std::cout << std::endl; + std::cout << " " << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " " << "GFLOPs: " << result.gflops << std::endl; + + return result; + } + + +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int kQueriesPerBlock, + int kKeysPerBlock, + int kMaxK, + cutlass::gemm::kernel::GroupScheduleMode GroupScheduleMode_ +> +int run_grouped(Options& options) { + using AttentionKernel = typename cutlass::gemm::kernel::DefaultFMHAGrouped< + cutlass::half_t, // scalar_t + cutlass::arch::Sm80, // ArchTag + true, // Memory is aligned + kQueriesPerBlock, + kKeysPerBlock, + kMaxK, + GroupScheduleMode_ + >::FMHAKernel; + + using FMHA = cutlass::gemm::device::GemmGrouped; + + // + // Test and profile + // + + TestbedAttention testbed(options); + + Result result = testbed.profile(); + if (!result.passed) { + std::cout << "Profiling CUTLASS attention has failed.\n"; + std::cout << "\nFailed\n"; + return -1; + } + + std::cout << "\nPassed\n"; + return 0; +} + + +template < + int kQueriesPerBlock, + int kKeysPerBlock, + int kMaxK +> +int run_attention(Options& options) { + if (options.scheduler_mode == cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly) { + return run_grouped(options); + } else { + return run_grouped(options); + } +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (__CUDACC_VER_MAJOR__ < 11 || props.major < 8) { + + // + // This example requires an NVIDIA Ampere-architecture GPU. + // + + std::cout + << "CUTLASS's CUTLASS Attention example requires a GPU of NVIDIA's Ampere Architecture or " + << "later (compute capability 80 or greater).\n"; + + return 0; + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + if (options.use_mask) { + std::cerr << "--use_mask is not supported at the moment\n"; + return -2; + } + if (options.alignment != 1) { + std::cerr << "--alignment=1 is the only supported value\n"; + return -2; + } + + // Determine kernel configuration based on head size. 
+ // If head size is less than or equal to 64, each block operates over 64 queries and + // 64 keys, and partial results can be stored in the register file. + // If head size is greater than 64, each block operates over 32 queries and 128 keys, + // and partial results are stored in shared memory. + if (options.head_size_v > 64) { + static int const kQueriesPerBlock = 32; + static int const kKeysPerBlock = 128; + if (options.head_size_v <= kKeysPerBlock) { + return run_attention(options); + } else { + return run_attention(options); + } + } else { + static constexpr int kMaxK = 64; // <- Decrease to 32/16 if your problem is smaller + static int const kQueriesPerBlock = 64; + static int const kKeysPerBlock = 64; + return run_attention(options); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/gemm/custom_mma.h b/examples/41_fused_multi_head_attention/gemm/custom_mma.h new file mode 100644 index 0000000000..80f5d4ea11 --- /dev/null +++ b/examples/41_fused_multi_head_attention/gemm/custom_mma.h @@ -0,0 +1,124 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include "custom_mma_multistage.h" +#include "custom_mma_pipelined.h" +#include "cutlass/gemm/threadblock/mma_multistage.h" +#include "cutlass/gemm/threadblock/mma_pipelined.h" + +template +struct MakeCustomMma; + +template < + typename Shape, + typename IteratorA, + typename SmemIteratorA, + cutlass::arch::CacheOperation::Kind CacheOpA, + typename IteratorB, + typename SmemIteratorB, + cutlass::arch::CacheOperation::Kind CacheOpB, + typename ElementC, + typename LayoutC, + typename Policy, + int Stages, + cutlass::gemm::SharedMemoryClearOption SharedMemoryClear, + int kMaxK> +struct MakeCustomMma< + cutlass::gemm::threadblock::MmaMultistage< + Shape, + IteratorA, + SmemIteratorA, + CacheOpA, + IteratorB, + SmemIteratorB, + CacheOpB, + ElementC, + LayoutC, + Policy, + Stages, + SharedMemoryClear>, + kMaxK> { + // Reduce the number of stages if we don't need that many + static int constexpr kStages = + kMaxK == cutlass::platform::numeric_limits::max() + ? Stages + : cutlass::const_min( + Stages, + (kMaxK + int(Shape::kK) - 1) / int(Shape::kK)); + using Mma = cutlass::gemm::threadblock::CustomMmaMultistage< + Shape, + IteratorA, + SmemIteratorA, + CacheOpA, + IteratorB, + SmemIteratorB, + CacheOpB, + ElementC, + LayoutC, + Policy, + kStages, + SharedMemoryClear, + kMaxK>; +}; + +template < + typename Shape, + typename IteratorA, + typename SmemIteratorA, + typename IteratorB, + typename SmemIteratorB, + typename ElementC, + typename LayoutC, + typename Policy, + int kMaxK> +struct MakeCustomMma< + cutlass::gemm::threadblock::MmaPipelined< + Shape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + Policy>, + kMaxK> { + using Mma = cutlass::gemm::threadblock::CustomMmaPipelined< + Shape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + Policy>; +}; diff --git a/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h b/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h new file mode 100644 index 0000000000..be25f79c4e --- /dev/null +++ b/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h @@ -0,0 +1,182 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/mma_base.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class CustomMmaBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. 
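+  /// (This is Policy::Operator::Shape, i.e. the M-by-N-by-K tile computed by a
+  /// single warp.)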
+ using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = GemmShape< + Shape::kM / WarpGemm::kM, + Shape::kN / WarpGemm::kN, + Shape::kK / WarpGemm::kK>; + + /// Number of warp-level GEMM oeprations + static int const kWarpGemmIterations = + (WarpGemm::kK / Operator::Policy::MmaShape::kK); + + /// Number of stages + static int const kStages = Stages; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + template + struct OperandSharedStorage { + AlignedBuffer buffer; + using TensorRef = TensorRef; + + CUTLASS_DEVICE + static OperandLayout Layout() { + return OperandLayout::packed({OperandShape::kRow, OperandShape::kColumn}); + } + + /// Returns a TensorRef to the operand + CUTLASS_HOST_DEVICE + TensorRef ref() { + return TensorRef{buffer.data(), Layout()}; + } + }; + + /// Shape of the A matrix operand in shared memory + using ShapeA = MatrixShape< + Shape::kM + Policy::SmemPaddingA::kRow, + Shape::kK * kStages + Policy::SmemPaddingA::kColumn>; + + /// Shape of the B matrix operand in shared memory + using ShapeB = MatrixShape< + Shape::kK * kStages + Policy::SmemPaddingB::kRow, + Shape::kN + Policy::SmemPaddingB::kColumn>; + + using SharedStorageA = OperandSharedStorage< + typename Operator::ElementA, + ShapeA, + typename Operator::LayoutA>; + using SharedStorageB = OperandSharedStorage< + typename Operator::ElementB, + ShapeB, + typename Operator::LayoutB>; + using TensorRefA = typename SharedStorageA::TensorRef; + using TensorRefB = typename SharedStorageB::TensorRef; + + struct SharedStorage { + /// Buffer for A operand + SharedStorageA operand_A; + + /// Buffer for B operand + SharedStorageB operand_B; + }; + + protected: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorageA& shared_storageA, + SharedStorageB& shared_storageB, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_A_(shared_storageA.ref(), lane_idx), + warp_tile_iterator_B_(shared_storageB.ref(), lane_idx) {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/gemm/custom_mma_multistage.h b/examples/41_fused_multi_head_attention/gemm/custom_mma_multistage.h new file mode 100644 index 0000000000..eedcb6376b --- /dev/null +++ b/examples/41_fused_multi_head_attention/gemm/custom_mma_multistage.h @@ -0,0 +1,760 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "custom_mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
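+/// This variant differs from cutlass::gemm::threadblock::MmaMultistage in that
+/// the global->shared prologue can be issued externally and skipped via
+/// set_prologue_done(), and that, when kMaxK fits within Shape::kK * Stages,
+/// the entire K extent stays resident in shared memory (kSmemContainsEntireMat)
+/// rather than being streamed through a circular buffer.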
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages
+    int Stages,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Upper bound on the K dimension
+    int kMaxK = cutlass::platform::numeric_limits<int>::max(),
+    /// Used for partial specialization
+    typename Enable = bool>
+class CustomMmaMultistage : public CustomMmaBase<Shape_, Policy_, Stages> {
+ public:
+  ///< Base class
+  using Base = CustomMmaBase<Shape_, Policy_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  /// Internal structure exposed for introspection.
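+  /// Detail derives how many cp.async copies are issued per pipeline stage for
+  /// each operand and how those copies are spread across the warp-level MMA
+  /// iterations.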
+ struct Detail { + static_assert( + Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / + Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / + Base::kWarpGemmIterations; + }; + + static bool const kSmemContainsEntireMat = kMaxK <= Shape::kK * Stages; + static constexpr int kNumStagesConcurrentLoad = + kSmemContainsEntireMat ? Stages : Stages - 1; + + private: + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + bool prologue_done_; + + // Set to `True` to ensure the accumulator will be zero outside the GEMM + // footprint + bool zero_outside_bounds_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storageA.ref(), thread_idx), + smem_iterator_B_(shared_storageB.ref(), thread_idx), + prologue_done_(false), + zero_outside_bounds_(false) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + CUTLASS_DEVICE + CustomMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& st, + ///< ID within the threadblock + int thread_idx, + ///< ID of 
warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : CustomMmaMultistage( + st.operand_A, + st.operand_B, + thread_idx, + warp_idx, + lane_idx) {} + + CUTLASS_DEVICE + bool set_prologue_done(bool value) { + prologue_done_ = value; + return true; + } + + CUTLASS_DEVICE + bool set_zero_outside_bounds(bool value) { + zero_outside_bounds_ = value; + return true; + } + + template + CUTLASS_DEVICE static void prologue( + typename Base::SharedStorage& shared_storage, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + prologue( + shared_storage.operand_A, + shared_storage.operand_B, + iterator_A, + iterator_B, + thread_idx, + problem_size_k); + } + + template + CUTLASS_DEVICE static void prologue( + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + SmemIteratorA smem_iterator_A(shared_storageA.ref(), thread_idx); + SmemIteratorB smem_iterator_B(shared_storageB.ref(), thread_idx); + int32_t iter = (problem_size_k + Base::Shape::kK - 1) / Base::Shape::kK; + _prologue( + iterator_A, iterator_B, iter, smem_iterator_A, smem_iterator_B); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance( + IteratorA& iterator_A, + IteratorB& iterator_B, + int group_start_A = 0, + int group_start_B = 0) { + iterator_A.set_iteration_index( + group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_A.get(); + + if (zero_outside_bounds_ || + SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index( + group_start_B * IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B.get(); + + if (zero_outside_bounds_ || + SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + } + + ++iterator_B; + } + 
++this->smem_iterator_B_; + } + } + } + + template + CUTLASS_DEVICE static void _prologue( + IteratorA& iterator_A, + IteratorB& iterator_B, + int32_t& gemm_k_iterations, + SmemIteratorA& smem_iterator_A_, + SmemIteratorB& smem_iterator_B_) { + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; + ++stage, --gemm_k_iterations) { + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + iterator_A.set_iteration_index(0); + smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast( + smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? kSrcBytes : 0); + + if (kLoadA) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + } + + ++iterator_A; + } + + ++smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast( + smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + if (kLoadB) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + } + + ++iterator_B; + } + + ++smem_iterator_B_; + } + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + smem_iterator_A_.add_tile_offset({0, 1}); + smem_iterator_B_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC& accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< initial value of accumulator + FragmentC const& src_accum) { + // + // Prologue + // + + if (!prologue_done_) { + _prologue( + iterator_A, + iterator_B, + gemm_k_iterations, + smem_iterator_A_, + smem_iterator_B_); + } else if (!kSmemContainsEntireMat) { + _prologue( + iterator_A, + iterator_B, + gemm_k_iterations, + smem_iterator_A_, + smem_iterator_B_); + } else { + gemm_k_iterations -= kNumStagesConcurrentLoad; + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // + // Clear the remaining tiles of SMEM. This is a functional requirement for + // some kernels so that all accumulator elements outside the GEMM footprint + // are zero. 
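+    // This clearing pass is taken only when SharedMemoryClear == kClearLastStage.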
+ // + + if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) { + /// Iterator to write threadblock-scoped tile of A operand to shared + /// memory + SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_); + + typename IteratorA::AccessType zero_A; + zero_A.clear(); + + last_smem_iterator_A.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast( + last_smem_iterator_A.get()); + + *dst_ptr = zero_A; + + ++last_smem_iterator_A; + } + + /// Iterator to write threadblock-scoped tile of B operand to shared + /// memory + SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_); + typename IteratorB::AccessType zero_B; + + zero_B.clear(); + last_smem_iterator_B.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast( + last_smem_iterator_B.get()); + + *dst_ptr = zero_B; + + ++last_smem_iterator_B; + } + } + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform( + warp_transformed_frag_A[0], + warp_transformed_frag_B[0], + warp_loaded_frag_A[0], + warp_loaded_frag_B[0]); + + // tf32x3 kernels use staging accumulation. warp_mma uses a temporary + // accumulator and this temporary accumulator is added to the final + // accumulator once in every mainloop iteration. + plus plus_accum; + + FragmentC tmp_accum; + + if (platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + tmp_accum.clear(); + } + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-kNumStagesConcurrentLoad);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + + this->warp_tile_iterator_A_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + // In case of a non-circular buffer ("kSmemContainsEntireMat") + // make sure we don't load out of bounds data. 
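+        // i.e. skip the final wrap-around fragment prefetch once the last K group
+        // has been consumed and the whole matrix is already resident in shared
+        // memory.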
+ if (!kSmemContainsEntireMat || + gemm_k_iterations > (-kNumStagesConcurrentLoad) || + warp_mma_k < Base::kWarpGemmIterations - 1) { + this->warp_tile_iterator_A_.load( + warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load( + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + } + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma.transform( + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + if (platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + warp_mma( + tmp_accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + tmp_accum); + + if (warp_mma_k == 0) { + accum = plus_accum(accum, tmp_accum); + tmp_accum.clear(); + } + } else { + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum); + } + + // Issue global->shared copies for the this stage + if (!kSmemContainsEntireMat && + warp_mma_k < Base::kWarpGemmIterations - 1) { + int group_start_iteration_A, group_start_iteration_B; + + group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA; + group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance( + iterator_A, + iterator_B, + group_start_iteration_A, + group_start_iteration_B); + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + if (!kSmemContainsEntireMat) { + int group_start_iteration_A, group_start_iteration_B; + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance( + iterator_A, + iterator_B, + group_start_iteration_A, + group_start_iteration_B); + } + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed. 
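+          // Together with the following __syncthreads(), this guarantees that the
+          // stage about to be read has fully arrived in shared memory.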
+ cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (!kSmemContainsEntireMat && + smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, + -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + } + + // Do any conversions feeding the first stage at the end of the loop so + // we can start right away on mma instructions + if (warp_mma_k + 1 == Base::kWarpGemmIterations) + warp_mma.transform( + warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + } + } + + if (platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + accum = plus_accum(accum, tmp_accum); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/gemm/custom_mma_pipelined.h b/examples/41_fused_multi_head_attention/gemm/custom_mma_pipelined.h new file mode 100644 index 0000000000..fd527a17b6 --- /dev/null +++ b/examples/41_fused_multi_head_attention/gemm/custom_mma_pipelined.h @@ -0,0 +1,401 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "custom_mma_base.h" +#include "cutlass/gemm/gemm.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_ = NumericArrayConverter< + typename SmemIteratorA_::Element, + typename IteratorA_::Element, + IteratorA_::Fragment::kElements>, + /// + /// Transformation applied to B operand + typename TransformB_ = NumericArrayConverter< + typename SmemIteratorB_::Element, + typename IteratorB_::Element, + IteratorB_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool> +class CustomMmaPipelined : public CustomMmaBase { + public: + ///< Base class + using Base = CustomMmaBase; + + using Shape = + Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA = + IteratorA_; ///< Iterates over tiles of A operand in global memory + using IteratorB = + IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = 
Policy_; ///< Policy describing tuning details + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + using TransformA = TransformA_; + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert( + (Base::kStages == 2), + "MmaPipelined requires kStages set to value 2"); + + static bool const kSmemContainsEntireMat = false; + + private: + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + + protected: + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaPipelined( + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ) + : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storageA.ref(), thread_idx), + smem_iterator_B_(shared_storageB.ref(), thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + CUTLASS_DEVICE + CustomMmaPipelined( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& st, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : CustomMmaPipelined( + st.operand_A, + st.operand_B, + thread_idx, + warp_idx, + lane_idx) {} + + CUTLASS_DEVICE + bool set_prologue_done(bool value) { + // NOT IMPLEMENTED FOR PIPELINED + } + + CUTLASS_DEVICE + bool set_zero_outside_bounds(bool value) { + // NOT NEEDED FOR PIPELINED + // shared memory will always be zero-filled + } + + template + CUTLASS_DEVICE 
static void prologue( + typename Base::SharedStorage& shared_storage, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + prologue( + shared_storage.operand_A, + shared_storage.operand_B, + iterator_A, + iterator_B, + thread_idx, + problem_size_k); + } + + template + CUTLASS_DEVICE static void prologue( + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + // NOT IMPLEMENTED FOR PIPELINED + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC& accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const& src_accum, ///< source accumulator tile + TransformA transform_A = + TransformA(), ///< transformation applied to A fragment + TransformB transform_B = + TransformB()) { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentA tb_frag_A; + FragmentB tb_frag_B; + + tb_frag_A.clear(); + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + iterator_A.clear_mask(gemm_k_iterations <= 1); + iterator_B.clear_mask(gemm_k_iterations <= 1); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* + // issuing shared memory loads (which have the tightest latency + // requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
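+        //
+        // On the last warp-level k-group, the threadblock fragments fetched
+        // during warp_mma_k == 0 are written into the other half of the
+        // shared-memory double buffer before the write-stage index is flipped.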
+ + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + // Write fragments to shared memory + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + __syncthreads(); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } else { + this->warp_tile_iterator_A_.add_tile_offset( + {0, + -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + // Avoid reading out of bounds if this was the last loop iteration + iterator_A.clear_mask(gemm_k_iterations <= 2); + iterator_B.clear_mask(gemm_k_iterations <= 2); + } + + warp_mma( + accum, + warp_frag_A[warp_mma_k % 2], + warp_frag_B[warp_mma_k % 2], + accum); + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/gemm/find_default_mma.h b/examples/41_fused_multi_head_attention/gemm/find_default_mma.h new file mode 100644 index 0000000000..ee7d3d6027 --- /dev/null +++ b/examples/41_fused_multi_head_attention/gemm/find_default_mma.h @@ -0,0 +1,191 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Cutlass provides helper template functions to figure out the right + datastructures to instanciate to run a GEMM with various parameters (see + `cutlass/gemm/threadblock/default_mma.h`). However, due to template + instantiation priority rules, it will only create an MmaMultiStage with + kStages=3 (otherwise creates an MmePipelined - which is not compatible with + FastF32). kStages=3 uses too much shared memory and we want to use kStages=2, + so we just copy-pasted some code from `default_mma.h` and + `default_mma_core.h` files and wrapped this template to allow our usecase. + + This is really only for the FastF32 case - aka using TensorCores with fp32. +*/ + +#pragma once + +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/default_mma_core_simt.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" + +namespace cutlass { +namespace gemm { +namespace threadblock { + +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Layout type for C and D matrix operand + typename LayoutC, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation perfomed by GEMM + typename Operator, + typename Enable_ = void> +struct FindDefaultMma { + static constexpr bool AccumulatorsInRowMajor = false; + static constexpr SharedMemoryClearOption SharedMemoryClear = + SharedMemoryClearOption::kNone; + using DefaultMma = cutlass::gemm::threadblock::DefaultMma< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementAccumulator, + LayoutC, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + Stages, + Operator, + AccumulatorsInRowMajor, + SharedMemoryClear>; +}; + +/// Specialization for sm80 / FastF32 / multistage with kStages=2 +template < + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + typename ElementB_, + /// Layout type for B 
matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + int kStages, + typename Operator> +struct FindDefaultMma< + ElementA_, + LayoutA_, + kAlignmentA, + ElementB_, + LayoutB_, + kAlignmentB, + ElementAccumulator, + layout::RowMajor, + arch::OpClassTensorOp, + arch::Sm80, + ThreadblockShape, + WarpShape, + InstructionShape, + kStages, + Operator, + typename cutlass::platform::enable_if<(kAlignmentA > 1)>::type> { + using LayoutC = layout::RowMajor; + using OperatorClass = arch::OpClassTensorOp; + using ArchTag = arch::Sm80; + + using DefaultMma_ = cutlass::gemm::threadblock::DefaultMma< + ElementA_, + LayoutA_, + kAlignmentA, + ElementB_, + LayoutB_, + kAlignmentB, + ElementAccumulator, + LayoutC, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + 3, + Operator>; + struct DefaultMma : DefaultMma_ { + using MmaCore_ = typename DefaultMma_::MmaCore; + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage< + typename MmaCore_::Shape, + typename DefaultMma_::IteratorA, + typename MmaCore_::SmemIteratorA, + MmaCore_::kCacheOpA, + typename DefaultMma_::IteratorB, + typename MmaCore_::SmemIteratorB, + MmaCore_::kCacheOpB, + ElementAccumulator, + LayoutC, + typename MmaCore_::MmaPolicy, + kStages>; + }; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/examples/41_fused_multi_head_attention/gemm/mma_accum_lambda_iterator.h b/examples/41_fused_multi_head_attention/gemm/mma_accum_lambda_iterator.h new file mode 100644 index 0000000000..0a67c4e853 --- /dev/null +++ b/examples/41_fused_multi_head_attention/gemm/mma_accum_lambda_iterator.h @@ -0,0 +1,378 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/functional.h" +#include "cutlass/gemm/warp/mma_simt_tile_iterator.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" +#include "cutlass/matrix_shape.h" + +/* +TensorCores have different accumulator layouts. +This file provides a class to easily map the accumulator +i-th element with the corresponding matrix row/col. +*/ + +template +struct AccumLambdaIteratorSm80 { + static_assert( + cutlass::platform:: + is_same::value, + "only RowMajor is supported"); + + using Policy = typename T::Policy; + using InstructionShape = typename T::InstructionShape; + using OpDelta = typename T::OpDelta; + using Shape = typename T::Shape; + static int const kElementsPerAccess = InstructionShape::kN / 4; + static int const kRowsPerTile = 8; + static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile; + + static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset( + int8_t lane_id, + int8_t warp_id, + typename T::TensorCoord const& tile_offset) { + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + return cutlass::MatrixCoord( + quad + tile_offset.row() * Shape::kRow, + lane_in_quad * kElementsPerAccess + + tile_offset.column() * Shape::kColumn); + } + + template + CUTLASS_DEVICE static void iterateRows( + cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) { + // See cutlass/gemm/warp/mma_tensor_op_tile_iterator.h + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < kAccumulatorRows; ++row) { + int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow + + row * kRowsPerTile + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + int mma_accum_start = kAccumulatorRows * kElementsPerAccess * + (mma_n * Policy::MmaIterations::kRow + mma_m); + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < kElementsPerAccess; ++col) { + int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + + col + lane_offset.column(); + int idx = mma_accum_start + row * kElementsPerAccess + col; + op(accum_m, accum_n, idx); + } + } + + endRow(accum_m); + } + } + } + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) { + // In each warp, 4 threads will work on the same row + // - the ones with the same `quad` + auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1); + myValue = fn(myValue, otherV); + otherV = __shfl_xor_sync(0xffffffff, myValue, 2); + myValue = fn(myValue, otherV); + int lane_in_quad = (lane_id & 3); + return lane_in_quad == 0; + } +}; + +template +struct AccumLambdaIteratorSm70 { + static_assert( + cutlass::platform:: + is_same::value, + "only RowMajor is supported"); + + using Policy = typename T::Policy; + 
using InstructionShape = typename T::InstructionShape; + using OpDelta = typename T::OpDelta; + using Shape = typename T::Shape; + using Element = accum_t; + + static int const kElementsPerPartial = 4; + using EleShapePerPatial = typename cutlass::platform::conditional< + cutlass::platform::is_same::value, + cutlass::MatrixShape<2, 2>, + cutlass::MatrixShape<1, 4>>::type; + static int const kElementsPerMma = 8; + static int const kAccumulatorPatials = 2; + using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>; + + static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset( + int8_t lane_id, + int8_t warp_id, + typename T::TensorCoord const& tile_offset) { + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + int accum_m, accum_n; + + if (cutlass::platform::is_same::value) { + // (quad[2],quad[0])+lane_in_quad[0] + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1); + // (quad[1])+lane_in_quad[1] + accum_n = + ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials + + (lane_in_quad & 2); + } else { + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + + lane_in_quad; // (quad[2],quad[0]) + accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials; + } + return cutlass::MatrixCoord( + accum_m + tile_offset.row() * Shape::kRow, + accum_n + tile_offset.column() * Shape::kColumn); + } + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) { + static_assert( + cutlass::platform::is_same::value, + "update to support non-float accum"); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-884-f16 + // T0 & T2 share same line within a quad + auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 1); + myValue = fn(myValue, otherV); + // quad 0 and quad 2 are on the same lines + otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 3); + myValue = fn(myValue, otherV); + return (lane_id & ((1 << 1) | (1 << 3))) == 0; + } + + template + CUTLASS_DEVICE static void iterateRows( + cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) { + CUTLASS_PRAGMA_UNROLL + for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < EleShapePerPatial::kRow; ++m) { + int accum_m = tile_m * Policy::InterleavedTile::kRow + + mma_m * QuadShapePerPatialMma::kRow + m * 2 + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; + ++tile_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; + ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < kAccumulatorPatials; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < EleShapePerPatial::kColumn; ++n) { + int mma_accum_start = + (((tile_n * Policy::TileIterations::kRow + tile_m) * + Policy::MmaIterations::kColumn + + mma_n) * + Policy::MmaIterations::kRow + + mma_m) * + kElementsPerMma; + int accum_n = tile_n * Policy::InterleavedTile::kColumn + + mma_n * QuadShapePerPatialMma::kColumn + + p * Policy::InterleavedTile::kColumn / 2 + n + + lane_offset.column(); + int idx = mma_accum_start + p * kElementsPerPartial + + m * EleShapePerPatial::kColumn + n; + op(accum_m, accum_n, idx); + } + } + } + } + endRow(accum_m); + } + } + } + } +}; + +template +struct AccumLambdaIteratorSimt { + using Policy = typename T::Policy; + using Iterations = typename 
T::Iterations; + using Element = typename T::Element; + using Delta = typename T::Delta; + using Shape = typename T::Shape; + static_assert( + cutlass::platform:: + is_same::value, + "only RowMajor is supported"); + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) { + CUTLASS_PRAGMA_UNROLL + for (int bit = 1; bit < Policy::WarpShape::kColumn; bit *= 2) { + auto otherV = __shfl_xor_sync(0xffffffff, myValue, bit); + myValue = fn(myValue, otherV); + } + return (lane_id & (Policy::WarpShape::kColumn - 1)) == 0; + } + + template + CUTLASS_DEVICE static void iterateRows( + cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) { + int accum_m = mma_m * Delta::kRow + m + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) { + int accum_n = + mma_n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN + + lane_offset.column(); + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) { + int idx = n + + Policy::LaneMmaShape::kN * + (mma_n + + Iterations::kColumn * + (m + mma_m * Policy::LaneMmaShape::kM)); + op(accum_m, accum_n + n, idx); + } + } + endRow(accum_m); + } + } + } + + static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset( + int8_t lane_id, + int8_t warp_id, + typename T::TensorCoord const& tile_offset) { + static_assert( + cutlass::platform::is_same< + typename Policy::LaneLayout, + cutlass::layout::RowMajorInterleaved<1>>::value, + ""); + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + cutlass::MatrixCoord lane_offset = lane_layout.inverse(lane_id) * + cutlass::MatrixCoord(Policy::LaneMmaShape::kM, + Policy::LaneMmaShape::kN); + return lane_offset + + tile_offset * cutlass::MatrixCoord(Shape::kRow, Shape::kColumn); + } +}; + +template +struct DefaultMmaAccumLambdaIterator; + +// Simt +template +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp::MmaSimtTileIterator< + S, + cutlass::gemm::Operand::kC, + accum_t, + cutlass::layout::RowMajor, + P, + 1, + 1>, + accum_t, + kWarpSize> { + using WarpIterator = typename cutlass::gemm::warp::MmaSimtTileIterator< + S, + cutlass::gemm::Operand::kC, + accum_t, + cutlass::layout::RowMajor, + P, + 1, + 1>; + using Iterator = AccumLambdaIteratorSimt; +}; + +// TensorOp - Volta +template +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + cutlass::MatrixShape<1, 1>>, + accum_t, + kWarpSize> { + using WarpIterator = + typename cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + cutlass::MatrixShape<1, 1>>; + using Iterator = AccumLambdaIteratorSm70; +}; + +// TensorOp - Sm75+ +template < + typename S1, + typename S2, + typename S3, + typename accum_t, + int kWarpSize> +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + S3>, + accum_t, + kWarpSize> { + using WarpIterator = + typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + S3>; + using Iterator = AccumLambdaIteratorSm80; +}; diff --git a/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h 
b/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h new file mode 100644 index 0000000000..3e41274349 --- /dev/null +++ b/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h @@ -0,0 +1,1955 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tools and utils to store a GEMM output in shmem, and to use that + output as operandA for another GEMM back-to-back +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/threadblock/default_epilogue_simt.h" +#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h" +#include "cutlass/functional.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/platform/platform.h" +#include "cutlass/transform/threadblock/vector_iterator.h" + +#include "../epilogue/epilogue_thread_apply_logsumexp.h" +#include "../gemm/mma_accum_lambda_iterator.h" +#include "../gemm_kernel_utils.h" +#include "../iterators/default_warp_iterator_from_smem.h" +#include "../iterators/make_residual_last.h" +#include "../iterators/transpose_warp_iterator.h" +#include "../iterators/warp_iterator_from_smem.h" +#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h" +#include "cutlass/gemm/threadblock/mma_base.h" +#include "cutlass/gemm/threadblock/mma_multistage.h" +#include "cutlass/gemm/threadblock/mma_pipelined.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h" + +namespace cutlass { +namespace gemm { +namespace threadblock { + +/// Shared storage object needed by accumulator +/// From 13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h +template < + typename Shape_, + typename Element_, + typename Layout_, + typename Padding_> +class AccumulatorSharedStorage { + public: + // + // Type definitions + // + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using Padding = Padding_; + + /// Tensor reference to the accumulator + using TensorRefAccum = cutlass::TensorRef; + + /// Shape of the accumulator matrix in shared memory + using ShapeAccum = cutlass:: + MatrixShape; + + public: + // + // Data members + // + + /// Buffer for accumulator + cutlass::AlignedBuffer accum; + + public: + // + // Methods + // + + /// Returns a layout object for the Accum matrix + CUTLASS_DEVICE + static Layout LayoutAccum() { + return Layout::packed({ShapeAccum::kRow, ShapeAccum::kColumn}); + } + + /// Returns a TensorRef to the Accumulator + CUTLASS_HOST_DEVICE + TensorRefAccum accum_ref() { + return TensorRefAccum{accum.data(), LayoutAccum()}; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
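+///
+/// Unlike the regular threadblock MmaBase, operand A is not staged here: it is
+/// read directly from an accumulator tile that already resides in shared
+/// memory, so the SharedStorage below only holds the staging buffer for
+/// operand B.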
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + // Maximum K dimension - also the dimension of the shared-memory + // holding `OperandA` + int kMaxK_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Layout in shared-memory of operand A + typename SmemLayoutA, + /// Used for partial specialization + typename Enable = bool> +class MmaBaseFromSharedMemory { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + static constexpr int kMaxK = kMaxK_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. + using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = GemmShape< + Shape::kM / WarpGemm::kM, + Shape::kN / WarpGemm::kN, + Shape::kK / WarpGemm::kK>; + using WarpCount1 = WarpCount; + + /// Number of warp-level GEMM oeprations + static int const kWarpGemmIterations = + (WarpGemm::kK / Operator::Policy::MmaShape::kK); + static int const kWarpGemmIterations1 = kWarpGemmIterations; + + /// Number of stages + static int const kStages = Stages; + + /// If this is true, we fill the entire shmem buffer at start + /// and don't need to iterate through it in a circular fashion + static bool const kSmemContainsEntireB = kMaxK <= Shape::kK * kStages; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the B operand + using TensorRefB = + TensorRef; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage { + public: + // + // Type definitions + // + + /// Shape of the B matrix operand in shared memory + using ShapeB = MatrixShape< + Shape::kK * kStages + Policy::SmemPaddingB::kRow, + Shape::kN + Policy::SmemPaddingB::kColumn>; + + public: + // + // Data members + // + + /// Buffer for B operand + AlignedBuffer operand_B; + + public: + // + // Methods + // + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() { + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); + } + + /// Returns a TensorRef to the B operand + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() { + return TensorRefB{operand_B.data(), LayoutB()}; + } + }; + + protected: + // + // Data members + // + + // /// Iterator to load a warp-scoped tile of A operand from shared memory + // typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + MmaBaseFromSharedMemory( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + TensorRefB& b_tile, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_B_(b_tile, lane_idx) {} +}; + +namespace { + +// has necessary trait compliance with WarpIteratorFromSmem but doesn't do +// anything, can be default initialized, and uses fragment that takes up +// (almost) no space. 
this warp iterator is selected at compile time when +// elementwise on-the-fly scaling for operand A is disabled, in which case +// operations related to loading scale factors for operand A get wiped out by +// the compiler. +template +class NoOpWarpIteratorScale { + public: + // in pipelined+multistage MMA implementations we keep an array of fragments. + // if we aren't using scaling we don't want to waste registers on fragments + // of scale elements, so ideally this would be sized 0. + // Since arrays of zero-sized objects are not allowed, using size as 1. + // The compiler will most likely wipe it out anyways. + using Fragment = cutlass::Array; + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale() {} + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale(TensorRef const&, int) {} + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale& add_tile_offset( + typename TensorRef::TensorCoord const&) { + return *this; + } + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale& operator++() { + return *this; + } + + CUTLASS_DEVICE + void load(Fragment&) const {} +}; + +// if scaling is enabled, performs fragment elementwise multiplication between +// fragment and its scaling factor. +template +class FragmentElementwiseScaler; + +// specialization for scaling being enabled. +template +class FragmentElementwiseScaler { + public: + // cast scale_frag to correct type then apply elementwise to fragment + CUTLASS_DEVICE + static Fragment apply(Fragment frag, FragmentScale const& scale_frag) { + Fragment converted_scale_frag = cutlass::NumericArrayConverter< + typename Fragment::Element, + typename FragmentScale::Element, + FragmentScale::kElements>()(scale_frag); + return cutlass::multiplies()(frag, converted_scale_frag); + } +}; + +// specialization for scaling being disabled. doesn't do anything and should +// just get wiped out by the compiler. +template +class FragmentElementwiseScaler { + public: + CUTLASS_DEVICE + static Fragment apply(Fragment frag, FragmentScale const&) { + return frag; + } +}; +} // namespace + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
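+///
+/// Two-stage (double-buffered) variant: operand A fragments are loaded by
+/// WarpIteratorA_ from an accumulator tile resident in shared memory, and are
+/// optionally multiplied elementwise by an A_scale tile when ScaleOperandA_ is
+/// true, while operand B tiles are streamed from global memory through the
+/// staging buffer provided by MmaBaseFromSharedMemory.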
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + // BEGIN smem + /// Iterates over the intermediate accumulator tile in shared memory + typename WarpIteratorA_, + /// whether or not to perform elementwise multiplication of A + // by another matrix (A_scale) that is also kept in shared memory prior + // to matmul A @ B + bool ScaleOperandA_, + /// Max GEMM problem size in K dimension + int MaxK, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to B operand + typename TransformB_ = NumericArrayConverter< + typename SmemIteratorB_::Element, + typename IteratorB_::Element, + IteratorB_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool> +class MmaPipelinedFromSharedMemory : public MmaBaseFromSharedMemory< + Shape_, + MaxK, + Policy_, + 2, + typename WarpIteratorA_::Layout> { + public: + ///< Base class + using Base = MmaBaseFromSharedMemory< + Shape_, + MaxK, + Policy_, + 2, + typename WarpIteratorA_::Layout>; + + using Shape = + Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + static constexpr bool ScaleOperandA = ScaleOperandA_; + + using WarpIteratorA = WarpIteratorA_; + ///< loads fragments of A_scale from shared memory if operand A scaling is + ///< enabled. otherwise no-op. + using WarpIteratorAScale = typename cutlass::platform::conditional< + ScaleOperandA, + WarpIteratorA, + NoOpWarpIteratorScale>::type; + + using IteratorB = + IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorB = SmemIteratorB_; + + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert( + (Base::kStages == 2), + "MmaPipelined requires kStages set to value 2"); + + private: + using WarpFragmentA = typename Operator::FragmentA; + + /// fragment type of OperandA elementwise scaling matrix. (almost) empty + /// if operand A scaling is disabled. + using WarpFragmentAScale = typename WarpIteratorAScale::Fragment; + + using WarpFragmentB = typename Operator::FragmentB; + + /// applies scaling factor to operand A fragment if operand A scaling is + /// enabled. otherwise no-op. 
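+  /// (the ScaleOperandA == false specialization of FragmentElementwiseScaler
+  /// returns the fragment unchanged, so it compiles away.)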
+ using FragmentAScaler = FragmentElementwiseScaler< + WarpFragmentA, + WarpFragmentAScale, + ScaleOperandA>; + + protected: + // /// Iterator to write threadblock-scoped tile of A operand to shared memory + // SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to load a warp-scoped tile of A operand from intermediate + /// accumulator tile + WarpIteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of A_scale from intermediate + /// accumulator tile (only used if ScaleOperandA_ is true) + WarpIteratorAScale warp_tile_iterator_A_scale_; + + public: + /// constructor for MMA with operand A scaling enabled. + CUTLASS_DEVICE + MmaPipelinedFromSharedMemory( + typename Base::TensorRefA a, // Operand A in shared memory + typename Base::TensorRefA a_scale, // Operand A_scale in shared memory + typename Base::TensorRefB + b_staging, // staging memory for loading tiles of B + int thread_idx, + int warp_idx, + int lane_idx) + : Base(b_staging, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A_(a, lane_idx), + warp_tile_iterator_A_scale_(a_scale, lane_idx), + smem_iterator_B_(b_staging, thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_A_scale_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + /// Construct from tensor references + CUTLASS_DEVICE + MmaPipelinedFromSharedMemory( + typename Base::TensorRefA a, ///< Operand A in shared memory + typename Base::TensorRefB b_staging, ///< staging memory for loading B + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx) ///< ID of each thread within a warp + : Base(b_staging, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A_(a, lane_idx), + smem_iterator_B_(b_staging, thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); 
+ } + + // For API compatibility with MmaMultistageFromSharedMemory + // but not supported as it worsens perf: older gpus < sm80 don't + // support async tranfers and have to waste registers + CUTLASS_DEVICE + void set_prologue_done(bool value) {} + CUTLASS_DEVICE + static void prologue( + typename Base::SharedStorage& shared_storage, + IteratorB iterator_B1, + int thread_idx, + int problem_size_0_n) {} + + CUTLASS_DEVICE + static void drain_cp_asyncs() {} + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC& accum, ///< destination accumulator tile + // IteratorA iterator_A, ///< iterator over A + // operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const& src_accum, ///< source accumulator tile + // TransformA transform_A = TransformA(), ///< transformation + // applied to A fragment + TransformB transform_B = + TransformB()) { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentB tb_frag_B; + + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_B.set_residual_tile(gemm_k_iterations == 1); + iterator_B.load(tb_frag_B); + + ++iterator_B; + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_B_; + + __syncthreads(); + + // remember that WarpFragmentAScale and WarpIteratorAScale are empty/no-op + // if scaling is disabled. + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentAScale warp_frag_A_scale[2]; + WarpFragmentB warp_frag_B[2]; + warp_frag_A[0].clear(); + warp_frag_A_scale[0].clear(); + warp_frag_B[0].clear(); + + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_A_scale_.load(warp_frag_A_scale[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_A_scale_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + iterator_B.set_residual_tile(gemm_k_iterations == 2); + iterator_B.clear_mask(gemm_k_iterations <= 1); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* + // issuing shared memory loads (which have the tightest latency + // requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
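+        //
+        // hasNext is cleared on the final mainloop iteration so that the
+        // trailing shared-memory and warp-fragment loads are skipped once no
+        // further k-tile remains.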
+ bool hasNext = true; + + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + if (gemm_k_iterations > 1) { + // Write fragments to shared memory + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + } + + __syncthreads(); + + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory SMEM: Don't reset iterator A, as + // we are continuing our iteration at this point + if (smem_write_stage_idx == 1) { + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } else { + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + } + + smem_write_stage_idx ^= 1; + hasNext = gemm_k_iterations > 1; + } + + // Only read the next if we need to + if (hasNext) { + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_A_scale_.load( + warp_frag_A_scale[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_A_scale_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + iterator_B.load(tb_frag_B); + + ++iterator_B; + + // Avoid reading out of bounds if this was the last loop iteration + iterator_B.set_residual_tile(gemm_k_iterations == 3); + iterator_B.clear_mask(gemm_k_iterations <= 2); + } + } + + warp_mma( + accum, + FragmentAScaler::apply( + warp_frag_A[warp_mma_k % 2], warp_frag_A_scale[warp_mma_k % 2]), + warp_frag_B[warp_mma_k % 2], + accum); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
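+///
+/// Multistage variant for Sm80+ built on cp.async: tiles of operand B are
+/// prefetched asynchronously into a Stages_-deep shared-memory buffer, while
+/// operand A (and its optional elementwise scale) is read from an accumulator
+/// tile already resident in shared memory via WarpIteratorA1_. When the whole
+/// K extent fits in the buffer (kSmemContainsEntireB), the circular-buffer
+/// bookkeeping and the mainloop cp.async copies are skipped.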
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape1_, + /// Iterates over the intermediate accumulator tile in shared memory + typename WarpIteratorA1_, + /// whether or not to perform elementwise multiplication of A + // by another matrix (A_scale) that is also kept in shared memory prior + // to matmul A @ B + bool ScaleOperandA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB1_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB1_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB1, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy1_, + /// Number of stages, + int Stages_, + int kMaxK_, + /// Used for partial specialization + typename Enable = bool> +class MmaMultistageFromSharedMemory : public MmaBaseFromSharedMemory< + Shape1_, + kMaxK_, + Policy1_, + Stages_, + typename WarpIteratorA1_::Layout> { + public: + ///< Base class + using Base = MmaBaseFromSharedMemory< + Shape1_, + kMaxK_, + Policy1_, + Stages_, + typename WarpIteratorA1_::Layout>; + + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape1 = Shape1_; + ///< Iterates over tiles of B operand in global memory + using IteratorB1 = IteratorB1_; + using IteratorB = IteratorB1; + ///< Policy describing tuning details + using Policy1 = Policy1_; + + using SmemIteratorB1 = SmemIteratorB1_; + using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate + ///< accumulator tile in shared memory + static constexpr bool ScaleOperandA = ScaleOperandA_; + + ///< warp level iterator over A_scale matrix tile kept in shared memory. + ///< if elementwise A scaling is disabled then everything this does is no-op. + using WarpIteratorAScale = typename cutlass::platform::conditional< + ScaleOperandA, + WarpIteratorA1, + NoOpWarpIteratorScale>::type; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1; + static constexpr bool kSmemContainsEntireB = Base::kSmemContainsEntireB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC1 = typename Policy1::Operator::FragmentC; + using FragmentC = FragmentC1; + + /// Warp-level Mma + using Operator1 = typename Policy1::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on B operand + static ComplexTransform const kTransformB1 = Operator1::kTransformB; + + /// Internal structure exposed for introspection. + struct Detail { + static_assert( + Base::kWarpGemmIterations1 > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand B + static int const TBLoadIterationsB1 = + IteratorB1::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB1 = + (TBLoadIterationsB1 + Base::kWarpGemmIterations1 - 1) / + Base::kWarpGemmIterations1; + }; + + static constexpr int kNumStagesConcurrentLoad = + kSmemContainsEntireB ? 
Base::kStages : Base::kStages - 1; + + private: + using WarpLoadedFragmentA1 = typename Operator1::FragmentA; + /// fragment of OperandA scale matrix. if operand A scaling is disabled this + /// is (almost) empty. + using WarpLoadedFragmentA1Scale = typename WarpIteratorAScale::Fragment; + using WarpLoadedFragmentB1 = typename Operator1::FragmentB; + using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA; + using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB; + + /// applies elementwise scaling to fragment of A. if operand A scaling is + /// disabled this is a no-op. + using FragmentAScaler = FragmentElementwiseScaler< + WarpLoadedFragmentA1, + WarpLoadedFragmentA1Scale, + ScaleOperandA>; + + private: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A1 operand from intermediate + /// accumulator tile + WarpIteratorA1 warp_tile_iterator_A1_; + + /// Iterator to load a warp-scoped tile of A1_scale operand from shared memory + /// if operand A scaling is disabled everything this does is a no-op. + WarpIteratorAScale warp_tile_iterator_A1_scale_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB1 smem_iterator_B1_; + + bool prologue_done_; + + public: + /// constructor for MMA with operand A scaling enabled. + CUTLASS_DEVICE + MmaMultistageFromSharedMemory( + typename Base::TensorRefA a, + typename Base::TensorRefA a_scale, + typename Base::TensorRefB b_tile, + int thread_idx, + int warp_idx, + int lane_idx) + : Base(b_tile, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A1_(a, lane_idx), + warp_tile_iterator_A1_scale_(a_scale, lane_idx), + smem_iterator_B1_(b_tile, thread_idx), + prologue_done_(false) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + int warp_idx_mn_1 = + warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM; + int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM; + + // Add per-warp offsets in units of warp-level tiles + warp_tile_iterator_A1_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + warp_tile_iterator_A1_scale_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1}); + } + + /// Construct from tensor references + CUTLASS_DEVICE + MmaMultistageFromSharedMemory( + typename Base::TensorRefA a, + typename Base::TensorRefB b_tile, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(b_tile, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A1_(a, lane_idx), + smem_iterator_B1_(b_tile, thread_idx), + prologue_done_(false) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn_1 = + warp_idx % 
(Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN); + + int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM; + int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM; + + // Add per-warp offsets in units of warp-level tiles + warp_tile_iterator_A1_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1}); + } + + CUTLASS_DEVICE + void set_prologue_done(bool value) { + prologue_done_ = value; + } + + CUTLASS_DEVICE + static void prologue( + typename Base::SharedStorage& shared_storage, + IteratorB iterator_B1, + int thread_idx, + int problem_size_0_n) { + SmemIteratorB1 smem_iterator_B1(shared_storage.operand_B_ref(), thread_idx); + _prologue( + iterator_B1, + (problem_size_0_n + Base::Shape::kK - 1) / Base::Shape::kK, + smem_iterator_B1); + } + + CUTLASS_DEVICE + static void drain_cp_asyncs() { + // commit and drain all pending and predicated cp.async pnz from the GEMM + // mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance_1( + IteratorB1& iterator_B1, + int group_start_B1 = 0) { + iterator_B1.set_iteration_index( + group_start_B1 * IteratorB1::kAccessesPerVector); + this->smem_iterator_B1_.set_iteration_index(group_start_B1); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) { + if (group_start_B1 + j < Detail::TBLoadIterationsB1) { + typename IteratorB1::AccessType* dst_ptr = + reinterpret_cast( + this->smem_iterator_B1_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / + IteratorB1::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B1.get(); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_B1.valid()); + + ++iterator_B1; + } + ++this->smem_iterator_B1_; + } + } + } + + CUTLASS_DEVICE + static void _prologue( + IteratorB& iterator_B1, + int32_t gemm_k_iterations_1, + SmemIteratorB1& smem_iterator_B1_) { + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; + ++stage, --gemm_k_iterations_1) { + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1); + iterator_B1.clear_mask(gemm_k_iterations_1 == 0); + + iterator_B1.set_iteration_index(0); + smem_iterator_B1_.set_iteration_index(0); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) { + typename IteratorB1::AccessType* dst_ptr = + reinterpret_cast( + smem_iterator_B1_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / + IteratorB1::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B1.get(), iterator_B1.valid()); + + ++iterator_B1; + } + + ++smem_iterator_B1_; + } + + // Move to the next stage + iterator_B1.add_tile_offset({1, 0}); + + smem_iterator_B1_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. 
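+      // cp_async_fence() commits the cp.async instructions issued above as a
+      // single group, which cp_async_wait later waits on stage by stage.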
+ cutlass::arch::cp_async_fence(); + } + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1); + iterator_B1.clear_mask(gemm_k_iterations_1 == 0); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations_1_, + ///< destination accumulator tile + FragmentC1& accum, + ///< iterator over B1 operand in global memory + IteratorB1 iterator_B1, + ///< initial value of accumulator + FragmentC1 const& src_accum) { + // 2nd Gemm + + // + // Prologue + // + // Perform accumulation in the 'd' output operand + accum = src_accum; + + if (!prologue_done_) { + _prologue(iterator_B1, gemm_k_iterations_1_, smem_iterator_B1_); + } else if (!kSmemContainsEntireB) { + // Restore the iterators increments + + int gemm_k_iterations_1 = gemm_k_iterations_1_; + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; + ++stage, --gemm_k_iterations_1) { + iterator_B1.set_iteration_index(0); + this->smem_iterator_B1_.set_iteration_index(0); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + ++iterator_B1; + } + ++this->smem_iterator_B1_; + } + iterator_B1.add_tile_offset({1, 0}); + this->smem_iterator_B1_.add_tile_offset({1, 0}); + } + iterator_B1.set_residual_tile(gemm_k_iterations_1 <= 1); + iterator_B1.clear_mask(gemm_k_iterations_1 <= 0); + } + + // DEPBAR+SYNC + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // remember that WarpFragmentAScale and WarpIteratorAScale are no-op/empty + // if scaling is disabled. + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA1 warp_loaded_frag_A1[2]; + WarpLoadedFragmentA1Scale warp_loaded_frag_A1_scale[2]; + WarpLoadedFragmentB1 warp_loaded_frag_B1[2]; + WarpTransformedFragmentA1 warp_transformed_frag_A1[2]; + WarpTransformedFragmentB1 warp_transformed_frag_B1[2]; + + Operator1 warp_mma1; + + warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]); + ++warp_tile_iterator_A1_; + + warp_tile_iterator_A1_scale_.load(warp_loaded_frag_A1_scale[0]); + ++warp_tile_iterator_A1_scale_; + + this->warp_tile_iterator_B_.set_kgroup_index(0); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B1[0]); + ++this->warp_tile_iterator_B_; + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma1.transform( + warp_transformed_frag_A1[0], + warp_transformed_frag_B1[0], + FragmentAScaler::apply( + warp_loaded_frag_A1[0], warp_loaded_frag_A1_scale[0]), + warp_loaded_frag_B1[0]); + + // tf32x3 kernels use staging accumulation. warp_mma uses a temporary + // accumulator and this temporary accumulator is added to the final + // accumulator once in every mainloop iteration. 
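+    // The fast-accurate F32 paths (OpMultiplyAddFastF32 /
+    // OpMultiplyAddComplexFastF32) accumulate into tmp_accum and fold it into
+    // 'accum' via plus_accum once per mainloop iteration (and once more after
+    // the loop); all other math operators accumulate directly into 'accum'.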
+ plus plus_accum; + + FragmentC1 tmp_accum; + + if (platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + tmp_accum.clear(); + } + + // + // Mainloop + // + + CUTLASS_PRAGMA_UNROLL + for (int gemm_k_iterations_1 = gemm_k_iterations_1_ - (Base::kStages - 1); + gemm_k_iterations_1 > (-Base::kStages + 1); + gemm_k_iterations_1--) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; + ++warp_mma_k) { + // Load warp-level tile from accumulator fragment (A) + // or shared memory (operand B) + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations1); + // skip warp tile loading for the last kgroup (we are out of the buf) + if (gemm_k_iterations_1 > (-Base::kStages + 2) || + warp_mma_k < Base::kWarpGemmIterations1 - 1) { + warp_tile_iterator_A1_.load( + warp_loaded_frag_A1[(warp_mma_k + 1) % 2]); + warp_tile_iterator_A1_scale_.load( + warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load( + warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + } + ++warp_tile_iterator_A1_; + ++warp_tile_iterator_A1_scale_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma1.transform( + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + FragmentAScaler::apply( + warp_loaded_frag_A1[warp_mma_k % 2], + warp_loaded_frag_A1_scale[warp_mma_k % 2]), + warp_loaded_frag_B1[warp_mma_k % 2]); + + if (platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + warp_mma1( + tmp_accum, + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + tmp_accum); + + if (warp_mma_k == 0) { + accum = plus_accum(accum, tmp_accum); + tmp_accum.clear(); + } + } else { + warp_mma1( + accum, + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + accum); + } + + // Issue global->shared copies for the this stage + if (warp_mma_k < Base::kWarpGemmIterations1 - 1) { + int group_start_iteration_B1; + + group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1; + + if (!kSmemContainsEntireB) { + copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1); + } + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations1) { + int group_start_iteration_B1; + group_start_iteration_B1 = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB1; + + if (!kSmemContainsEntireB) { + copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1); + } + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed. 
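+          // Together with the __syncthreads() that follows, this makes the
+          // stage that just finished copying visible to every warp before its
+          // shared-memory tiles are read.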
+ arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_B1.add_tile_offset({1, 0}); + + this->smem_iterator_B1_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (!kSmemContainsEntireB) { + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy1::kPartitionsK * + Base::kWarpGemmIterations1, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + } + + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 2); + iterator_B1.clear_mask(gemm_k_iterations_1 == 1); + } + + // Do any conversions feeding the first stage at the end of the loop so + // we can start right away on mma instructions + if (warp_mma_k + 1 == Base::kWarpGemmIterations1) + warp_mma1.transform( + warp_transformed_frag_A1[(warp_mma_k + 1) % 2], + warp_transformed_frag_B1[(warp_mma_k + 1) % 2], + FragmentAScaler::apply( + warp_loaded_frag_A1[(warp_mma_k + 1) % 2], + warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]), + warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + } + } + + if (platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + accum = plus_accum(accum, tmp_accum); + } + } +}; + +// Converts a "regular" Mma into their counterpart from shared memory +template < + typename Mma_, + int kMaxK, + typename WarpIteratorA_, + /// whether or not to apply elementwise multiplication of operand A by + /// another matrix in shared memory before usage in A @ B + bool kScaleOperandA, + bool kTransposeA = false> +struct DefaultMmaFromSharedMemory; + +// Mma pipelined +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + typename WarpIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_, + /// Transformation applied to B operand + typename TransformB_, + // Max MMA problem size K + int kMaxK, + /// whether or not to apply elementwise multiplication of operand A by + /// another matrix in shared memory before usage in A @ B + bool kScaleOperandA, + bool kTransposeA> +struct DefaultMmaFromSharedMemory< + MmaPipelined< + Shape_, + IteratorA_, + SmemIteratorA_, + IteratorB_, + SmemIteratorB_, + ElementC_, + LayoutC_, + Policy_, + TransformA_, + TransformB_>, + kMaxK, + WarpIteratorA_, + kScaleOperandA, + kTransposeA> { + using RegularMma = 
MmaPipelined< + Shape_, + IteratorA_, + SmemIteratorA_, + IteratorB_, + SmemIteratorB_, + ElementC_, + LayoutC_, + Policy_, + TransformA_, + TransformB_>; + + using WarpShape = typename Policy_::Operator::Shape; + using InstructionShape = typename Policy_::Operator::InstructionShape; + using ArchMmaOperator = typename Policy_::Operator; + + static constexpr bool kIsTransposedA = false; + using WarpIteratorA = WarpIteratorA_; + using IteratorB = + typename cutlass::transform::threadblock::MakeIteratorResidualLast< + IteratorB_>::Iterator; + + using Mma = typename cutlass::gemm::threadblock::MmaPipelinedFromSharedMemory< + Shape_, + WarpIteratorA, + kScaleOperandA, + kMaxK, + IteratorB, + SmemIteratorB_, + ElementC_, + LayoutC_, + Policy_>; +}; + +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + typename WarpIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + int kMaxK, + /// whether or not to apply elementwise multiplication of operand A by + /// another matrix in shared memory before usage in A @ B + bool kScaleOperandA, + bool kTransposeA> +struct DefaultMmaFromSharedMemory< + MmaMultistage< + Shape_, + IteratorA_, + SmemIteratorA_, + CacheOpA, + IteratorB_, + SmemIteratorB_, + CacheOpB, + ElementC_, + LayoutC_, + Policy_, + Stages, + SharedMemoryClear>, + kMaxK, + WarpIteratorA_, + kScaleOperandA, + kTransposeA> { + using RegularMma = MmaMultistage< + Shape_, + IteratorA_, + SmemIteratorA_, + CacheOpA, + IteratorB_, + SmemIteratorB_, + CacheOpB, + ElementC_, + LayoutC_, + Policy_, + Stages, + SharedMemoryClear>; + + using WarpShape = typename Policy_::Operator::Shape; + using InstructionShape = typename Policy_::Operator::InstructionShape; + using WarpIteratorTranspose = TransposeWarpIterator; + static constexpr bool kIsTransposedA = + WarpIteratorTranspose::kSupportsTranspose && kTransposeA; + using WarpIteratorA = typename platform::conditional< + kIsTransposedA, + typename WarpIteratorTranspose::Iterator, + WarpIteratorA_>::type; + + // Reduce the number of stages if we don't need that many + static int constexpr kStagesMax = + (kMaxK + int(Shape_::kK) - 1) / int(Shape_::kK); + static int constexpr kStages = cutlass::const_min(Stages, kStagesMax); + + using IteratorB = + typename cutlass::transform::threadblock::MakeIteratorResidualLast< + IteratorB_>::Iterator; + using Mma = + typename cutlass::gemm::threadblock::MmaMultistageFromSharedMemory< + Shape_, + WarpIteratorA, 
+ kScaleOperandA, + IteratorB, + SmemIteratorB_, + RegularMma::kCacheOpB, + ElementC_, + LayoutC_, + Policy_, + kStages, + kMaxK>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename IteratorC, + typename Operator, + typename scalar_t, + typename WarpShape_, + typename ThreadblockShape_> +struct B2bGemm; + +// Tensor Cores >= Sm75 specialization (Ampere ...) +template < /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Element type + typename Element_, + /// Layout of operand in memory + typename Layout_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions, concept: MatrixShape) + typename OpDelta_, + typename Operator, + typename scalar_t, + typename WarpShape_, + typename ThreadblockShape_> +struct B2bGemm< + cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + Shape_, + Element_, + Layout_, + InstructionShape_, + OpDelta_>, + Operator, + scalar_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = + typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + Shape_, + Element_, + Layout_, + InstructionShape_, + OpDelta_>; + using FragmentC = typename IteratorC::Fragment; + using InstructionShape = InstructionShape_; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using accum_t = Element_; + using lse_scalar_t = float; + + using SmemAccumulatorLayout = cutlass::layout::RowMajor; + + // Iterator to load accumulators (results of matmul in registers) + using FragmentIteratorAccumulator = + cutlass::epilogue::warp::FragmentIteratorTensorOp< + WarpShape, + InstructionShape, + accum_t, + typename Operator::Policy::Operator::FragmentC, + cutlass::layout::RowMajor>; + + // Iterator to store to shared-memory + using SmemIteratorD0 = typename cutlass::epilogue::warp::TileIteratorTensorOp< + WarpShape, + InstructionShape, + scalar_t, // accum_t, + SmemAccumulatorLayout>; + using AccumulatorSharedStorage = + cutlass::gemm::threadblock::AccumulatorSharedStorage< + ThreadblockShape, + typename SmemIteratorD0::Element, + typename SmemIteratorD0::TensorLayout, + typename SmemIteratorD0::Padding>; + // We need to provide an operation for the epilogue. Let's create an + // operation that does nothing (ScaleType::Nothing), just converts + // from accum_t (float) -> scalar_t (can be half) + using OutputOpNoOp = cutlass::epilogue::thread::LinearCombination< + typename SmemIteratorD0::Element, // ElementOutput + FragmentIteratorAccumulator::Fragment::kElements, + accum_t, // ElementAccumulator + typename SmemIteratorD0::Element, // ElementCompute + cutlass::epilogue::thread::ScaleType::Nothing>; + using Epilogue = cutlass::epilogue::threadblock::EpilogueSmemAccumulator< + SmemIteratorD0, + FragmentIteratorAccumulator, + SmemIteratorD0, // ScaleBiasIterator - not used + OutputOpNoOp>; + + // Epilogue 2: with LSE (for backwards pass) + static int const kElementsPerAccess = 2; // TODO: Why 2? 
+ using IteratorAccumulatorLSE = + cutlass::transform::threadblock::VectorIterator< + cutlass::transform::threadblock::PredicatedVectorAccessIterator< + // Shape + cutlass::MatrixShape, + // WarpShape + cutlass::MatrixShape, + lse_scalar_t, + cutlass::layout::RowMajor, + kElementsPerAccess>>; + using EpilogueOpApplyLSE = cutlass::epilogue::thread::ApplyLogSumExp< + scalar_t, // ElementOutput_ + lse_scalar_t, // ElementLSE_ + accum_t, // ElementAccumulator_ + accum_t, // ElementCompute_ + 128 / cutlass::sizeof_bits::value + // FragmentIteratorAccumulator::Fragment::kElements + // InstructionShape::kM * InstructionShape::kN / 32 + >; + using EpilogueWithLSE = + cutlass::epilogue::threadblock::EpilogueSmemAccumulator< + SmemIteratorD0, + FragmentIteratorAccumulator, + IteratorAccumulatorLSE, + EpilogueOpApplyLSE>; + + static void CUTLASS_DEVICE accumToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id); + smem_iterator_attn.add_tile_offset( + tile_coords * + cutlass::MatrixCoord{ + SmemIteratorD0::TileIterations::kRow, + SmemIteratorD0::TileIterations::kColumn}); + Epilogue epilogue; + epilogue(OutputOpNoOp({}), smem_iterator_attn, accum); + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC& accum, + lse_scalar_t const* lse, + int32_t lse_extents, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + constexpr int32_t kAlignLSE = 32; + IteratorAccumulatorLSE iterator_lse( + lse, + {(int32_t)0, (int32_t)ceil_div(lse_extents, kAlignLSE) * kAlignLSE}, + thread_id, + warp_id, + cutlass::MatrixCoord{0, 0} // offset + ); + + SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id); + smem_iterator_attn.add_tile_offset( + tile_coords * + cutlass::MatrixCoord{ + SmemIteratorD0::TileIterations::kRow, + SmemIteratorD0::TileIterations::kColumn}); + EpilogueWithLSE epilogue; + EpilogueOpApplyLSE minus_lse_exp({}); + epilogue( + minus_lse_exp, + smem_iterator_attn, + accum, + // scale - unused + iterator_lse, + // bias + iterator_lse); + } +}; + +// Volta Specialization +// only supported for f16 +template +struct B2bGemm< + cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + cutlass::MatrixShape<32, 32>, + float, + cutlass::layout::RowMajor, + cutlass::gemm::GemmShape<16, 16, 4>, + cutlass::MatrixShape<1, 1>>, + Operator, + cutlass::half_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = + cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + cutlass::MatrixShape<32, 32>, + float, + cutlass::layout::RowMajor, + cutlass::gemm::GemmShape<16, 16, 4>, + cutlass::MatrixShape<1, 1>>; + using scalar_t = cutlass::half_t; + using accum_t = IteratorC::Element; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using FragmentC = IteratorC::Fragment; + using lse_scalar_t = float; + + // Storage in shared-memory for Q.Kt + using SmemAccumulatorLayout = + cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>; + using AccumulatorSharedStorage = + cutlass::gemm::threadblock::AccumulatorSharedStorage< + ThreadblockShape, + scalar_t, + SmemAccumulatorLayout, + cutlass::MatrixShape<0, 0> // Padding + >; + using TensorRef = cutlass::TensorRef; + using Policy = typename IteratorC::Policy; + using Element = accum_t; + // Those are MmaVoltaTensorOpAccumulatorTileIterator private fields + 
// Let's copy their values + static int const kElementsPerPartial = 4; + using EleShapePerPatial = typename cutlass::platform::conditional< + cutlass::platform::is_same::value, + cutlass::MatrixShape<2, 2>, + cutlass::MatrixShape<1, 4>>::type; + static int const kElementsPerMma = 8; + static int const kAccumulatorPatials = 2; + using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>; + + static void CUTLASS_DEVICE accumToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + // ctor - from MmaVoltaTensorOpAccumulatorTileIterator + TensorRef ref_(shared_storage.accum_ref()); + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + int accum_m, accum_n; + + if (cutlass::platform::is_same::value) { + // (quad[2],quad[0])+lane_in_quad[0] + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1); + // (quad[1])+lane_in_quad[1] + accum_n = + ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials + + (lane_in_quad & 2); + } else { + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + + lane_in_quad; // (quad[2],quad[0]) + accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials; + } + cutlass::MatrixCoord lane_offset(accum_m, accum_n); + + // Tile offset + ref_.add_coord_offset( + tile_coords * + cutlass::MatrixCoord( + {IteratorC::Shape::kRow, IteratorC::Shape::kColumn})); + + using AccessType = cutlass::Array; + + // store - from MmaVoltaTensorOpAccumulatorTileIterator + CUTLASS_PRAGMA_UNROLL + for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) { + CUTLASS_PRAGMA_UNROLL + for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + int mma_accum_start = + (((tile_n * Policy::TileIterations::kRow + tile_m) * + Policy::MmaIterations::kColumn + + mma_n) * + Policy::MmaIterations::kRow + + mma_m) * + kElementsPerMma; + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < kAccumulatorPatials; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < EleShapePerPatial::kRow; ++m) { + int accum_m = tile_m * Policy::InterleavedTile::kRow + + mma_m * QuadShapePerPatialMma::kRow + m * 2; + int accum_n = tile_n * Policy::InterleavedTile::kColumn + + mma_n * QuadShapePerPatialMma::kColumn + + p * Policy::InterleavedTile::kColumn / 2; + int r = (accum_m + lane_offset.row()); + AccessType to_store; + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < EleShapePerPatial::kColumn; ++n) { + int idx = mma_accum_start + p * kElementsPerPartial + + m * EleShapePerPatial::kColumn + n; + int c = (accum_n + n + lane_offset.column()); + to_store[n] = scalar_t(accum[idx]); + } + int c = (accum_n + lane_offset.column()); + assert(r < 32); + assert(c < 32); + *reinterpret_cast( + ref_.data() + ref_.offset({r, c})) = to_store; + } + } + } + } + } + } + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem( + AccumulatorSharedStorage& shared_storage, + typename IteratorC::Fragment& accum, + lse_scalar_t const* lse, + int lse_extent, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + // Non-optimized way to apply LSE to registers + // NOTE: accum is attn.T + // TODO: Optimize for each architecture + static constexpr int WarpSize = 32; + using AccumLambdaIterator = + typename DefaultMmaAccumLambdaIterator:: + Iterator; + auto lane_offset = + 
AccumLambdaIterator::get_lane_offset(lane_id, warp_id, tile_coords); + + cutlass::Array lse_prefetched; + lse_prefetched.clear(); + int rowIdx = 0; + int colIdx = 0; + AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + ++rowIdx; + colIdx = 0; + }, + [&](int accum_m, int accum_n, int idx) { + if (rowIdx == 1) { + lse_prefetched[colIdx] = accum_n < lse_extent + ? lse[accum_n] + : cutlass::platform::numeric_limits::infinity(); + } + accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]); + ++colIdx; + }, + [&](int accum_m) {}); + accumToSmem(shared_storage, accum, lane_id, tile_coords); + } +}; + +// Simt Specialization +// for f32 on Sm70-Sm75 and f16/f32 below + +template < + typename Operator, + typename OperatorPolicy, + typename scalar_t, + typename WarpShape_, + typename ThreadblockShape_> +struct B2bGemm< + cutlass::gemm::warp::MmaSimtTileIterator< + cutlass::MatrixShape<32, 32>, + cutlass::gemm::Operand::kC, + float, + cutlass::layout::RowMajor, + OperatorPolicy, + 1, + 1>, + Operator, + scalar_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator< + cutlass::MatrixShape<32, 32>, + cutlass::gemm::Operand::kC, + float, + cutlass::layout::RowMajor, + OperatorPolicy, + 1, + 1>; + using accum_t = typename IteratorC::Element; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using FragmentC = typename IteratorC::Fragment; + using lse_scalar_t = float; + + // Storage in shared-memory for Q.Kt + using AccumulatorSharedStorage = + cutlass::gemm::threadblock::AccumulatorSharedStorage< + ThreadblockShape, + scalar_t, + cutlass::layout::ColumnMajor, + cutlass::MatrixShape<0, 0> // Padding + >; + + static void CUTLASS_DEVICE accumToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + using Policy = typename IteratorC::Policy; + using Element = typename IteratorC::Element; + using Iterations = typename IteratorC::Iterations; + using Delta = typename IteratorC::Delta; + + auto ref_ = shared_storage.accum_ref(); + // ctor - MmaSimtTileIterator + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + MatrixCoord lane_offset = lane_layout.inverse(lane_id) * + MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN); + + ref_.add_coord_offset(lane_offset); + + // Tile offset + ref_.add_coord_offset( + tile_coords * + cutlass::MatrixCoord( + {IteratorC::Shape::kRow, IteratorC::Shape::kColumn})); + + // store - MmaSimtTileIterator + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) { + int r = + Policy::LaneMmaShape::kM * (mma_m * Policy::WarpShape::kRow) + + m; + int c = mma_n * Delta::kColumn + n; + int idx = n + + Policy::LaneMmaShape::kN * + (mma_n + + Iterations::kColumn * + (m + mma_m * Policy::LaneMmaShape::kM)); + ref_.at({r, c}) = scalar_t(accum[idx]); + } + } + } + } + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem( + AccumulatorSharedStorage& shared_storage, + typename IteratorC::Fragment& accum, + lse_scalar_t const* lse, + int lse_extent, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + // Non-optimized way to apply 
LSE to registers + // NOTE: accum is attn.T + // TODO: Optimize for each architecture + static constexpr int WarpSize = 32; + using AccumLambdaIterator = + typename DefaultMmaAccumLambdaIterator:: + Iterator; + auto lane_offset = + AccumLambdaIterator::get_lane_offset(lane_id, warp_id, tile_coords); + + cutlass::Array lse_prefetched; + lse_prefetched.clear(); + int rowIdx = 0; + int colIdx = 0; + AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + ++rowIdx; + colIdx = 0; + }, + [&](int accum_m, int accum_n, int idx) { + if (rowIdx == 1) { + lse_prefetched[colIdx] = accum_n < lse_extent + ? lse[accum_n] + : cutlass::platform::numeric_limits::infinity(); + } + accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]); + ++colIdx; + }, + [&](int accum_m) {}); + accumToSmem(shared_storage, accum, lane_id, tile_coords); + } +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/gemm_kernel_utils.h b/examples/41_fused_multi_head_attention/gemm_kernel_utils.h new file mode 100644 index 0000000000..a770e0b671 --- /dev/null +++ b/examples/41_fused_multi_head_attention/gemm_kernel_utils.h @@ -0,0 +1,258 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/arch/mma.h" + +//////////////////////////////////////////////////////////////////////////////// +// Some helper functions +//////////////////////////////////////////////////////////////////////////////// +#define DISPATCH_TYPES(tensor, func) \ + { \ + if (query.scalar_type() == at::ScalarType::Float) { \ + using scalar_t = float; \ + func(); \ + } else if (query.scalar_type() == at::ScalarType::Half) { \ + using scalar_t = cutlass::half_t; \ + func(); \ + } else if (query.scalar_type() == at::ScalarType::BFloat16) { \ + using scalar_t = cutlass::bfloat16_t; \ + func(); \ + } else { \ + XFORMERS_CHECK(false, "Only fp32, half & bf16 supported at the moment"); \ + } \ + } + +#define DISPATCH_BOOL(BOOL_V, BOOL_NAME, F) \ + { \ + if (BOOL_V) { \ + using BOOL_NAME = std::true_type; \ + F(); \ + } else { \ + using BOOL_NAME = std::false_type; \ + F(); \ + } \ + } + +#define DISPATCH_ARCHTAG(CC, func) \ + { \ + if (CC >= 80) { \ + using ArchTag = cutlass::arch::Sm80; \ + func(); \ + } else if (CC >= 75) { \ + using ArchTag = cutlass::arch::Sm75; \ + func(); \ + } else if (CC >= 70) { \ + using ArchTag = cutlass::arch::Sm70; \ + func(); \ + } else if (CC >= 50) { \ + using ArchTag = cutlass::arch::Sm50; \ + func(); \ + } else { \ + XFORMERS_CHECK( \ + false, \ + "Your device is too old. We require compute capability >= 50"); \ + } \ + } + +#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR) \ + XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + XFORMERS_CHECK(TENSOR.is_contiguous()); + +#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR) \ + XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + XFORMERS_CHECK( \ + TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous"); + +#ifdef TORCH_CHECK +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + XFORMERS_CHECK( \ + uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned") +#define XFORMERS_CHECK TORCH_CHECK +#elif defined(__CUDACC_RTC__) +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + if (!(uint64_t(PTR) % ALIGNMENT == 0)) { \ + return false; \ + } +#define XFORMERS_CHECK(COND, ERR) \ + if (!(COND)) { \ + return false; \ + } +#else +#include +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + if (!(uint64_t(PTR) % ALIGNMENT == 0)) { \ + std::cerr << #PTR " is not correctly aligned\n"; \ + return false; \ + } +#define XFORMERS_CHECK(COND, ERR) \ + if (!(COND)) { \ + std::cerr << "'" #COND "' failed: " << ERR << "\n"; \ + return false; \ + } +#endif + +#define ASSIGN_CHECK_OVERFLOW(A, B) \ + { \ + A = B; \ + XFORMERS_CHECK( \ + B < std::numeric_limits::max(), #B " overflows"); \ + } + +namespace gemm_kernel_utils { + +template +constexpr CUTLASS_HOST_DEVICE integer ceil_div(integer n, integer m) { + return (n + m - 1) / m; +} + +template +constexpr CUTLASS_HOST_DEVICE integer align_up(integer n, integer m) { + return ((n + m - 1) / m) * m; +} + +//////////////////////////////////////////////////////////////////////////////// +// Determine the type of GEMM we do (TensorCores or not, Shapes ...) 
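+// DefaultGemmType<ArchTag, scalar_t> is a small traits bundle: it selects the
+// operator class (SIMT vs. TensorOp), MMA instruction shape, minimum
+// alignment and math operator for a given architecture / element type, with
+// the unspecialized template falling back to SIMT FMA. A hedged usage sketch
+// (relying only on the Sm75+ 16-bit specialization defined below):
+//
+//   using Traits = DefaultGemmType<cutlass::arch::Sm80, cutlass::half_t>;
+//   static_assert(cutlass::platform::is_same<typename Traits::OpClass,
+//                     cutlass::arch::OpClassTensorOp>::value, "");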
+// TODO: Maybe we could rely on Cutlass's DefaultGemm templates +//////////////////////////////////////////////////////////////////////////////// + +// Fallback to Simt (FMA on cuda cores) if not in a special case below +template +struct DefaultGemmType { + static constexpr int ThreadK = 8; + static constexpr int WarpK = 8; + static constexpr int kMinimumAlignment = 1; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + using OpClass = cutlass::arch::OpClassSimt; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specialization for tensorcores with f32 +template +struct DefaultGemmType< + ArchTag, + float, + typename cutlass::platform::enable_if< + ArchTag::kMinComputeCapability >= 80>::type> { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 4; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Operator = cutlass::arch::OpMultiplyAddFastF32; +}; + +// Specialization for tensorcores with f16/bf16 - Sm75+ +template +struct DefaultGemmType< + ArchTag, + scalar_t, + typename cutlass::platform::enable_if< + ArchTag::kMinComputeCapability >= 75 && + cutlass::sizeof_bits::value == 16>::type> { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 4; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specialization for tensorcores with f16 - Volta +template <> +struct DefaultGemmType { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 2; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Enables to do +// `auto x = kCondition ? 
fa(arg) : fb(arg)` +// when `fa` and `fb` have different types +template +struct call_conditional; + +template +struct call_conditional { + template + static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg) + -> decltype(ta(arg)) { + return ta(arg); + } +}; + +template +struct call_conditional { + template + static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg) + -> decltype(tb(arg)) { + return tb(arg); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Mark a variable as warp-uniform - enables some compiler optimizations +// The cheapest way to do it is just to broadcast it from lane 0 +//////////////////////////////////////////////////////////////////////////////// + +template +CUTLASS_DEVICE T warp_uniform(T value) { + struct { + union { + T value; + uint32_t asInt; + }; + } p; + p.value = value; + p.asInt = __shfl_sync(0xffffffff, (unsigned)p.asInt, 0); + return p.value; +} + +template +CUTLASS_DEVICE T* warp_uniform(T* ptr) { + struct { + union { + T* ptr; + uint32_t asInt[2]; + }; + } p; + p.ptr = ptr; + p.asInt[0] = warp_uniform(p.asInt[0]); + p.asInt[1] = warp_uniform(p.asInt[1]); + return p.ptr; +} +} // namespace gemm_kernel_utils diff --git a/examples/41_fused_multi_head_attention/iterators/default_warp_iterator_from_smem.h b/examples/41_fused_multi_head_attention/iterators/default_warp_iterator_from_smem.h new file mode 100644 index 0000000000..3dbb0cf285 --- /dev/null +++ b/examples/41_fused_multi_head_attention/iterators/default_warp_iterator_from_smem.h @@ -0,0 +1,142 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Instanciates the right WarpIterator to read from shared memory + The class `DefaultWarpIteratorAFromSharedMemory` is useful when reading + data dumped with `B2bGemm::accumToSmem`. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h" +#include "cutlass/platform/platform.h" + +#include "warp_iterator_from_smem.h" + +namespace cutlass { +namespace gemm { +namespace threadblock { + +template < + typename WarpShape, + typename InstructionShape, + typename RegularWarpIterator, + typename Policy, + typename Enable = void> +struct DefaultWarpIteratorAFromSharedMemory {}; + +// TensorOp - Ampere half +template +struct DefaultWarpIteratorAFromSharedMemory< + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, kInstrK>, + RegularWarpIterator, + Policy, + typename platform::enable_if<( + sizeof_bits::value == 16 && + Policy::Operator::Policy::OpDelta::kRow == 1)>::type> { + using OpDelta = typename Policy::Operator::Policy::OpDelta; + using WarpShape = cutlass::MatrixShape<32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, kInstrK>; + + using WarpIterator = cutlass::gemm::warp::WarpIteratorFromSmem< + cutlass::gemm::Operand::kA, + typename RegularWarpIterator::Element, + cutlass::MatrixShape>; +}; + +// TensorOp - Ampere f32 +template +struct DefaultWarpIteratorAFromSharedMemory< + WarpShape, + cutlass::gemm::GemmShape<16, 8, 8>, + RegularWarpIterator, + Policy, + typename platform::enable_if<( + sizeof_bits::value != 16 || + Policy::Operator::Policy::OpDelta::kRow != 1)>::type> { + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + static constexpr auto kWarpSize = 32; + using OpDelta = typename Policy::Operator::Policy::OpDelta; + + using WarpIterator = + cutlass::gemm::warp::MmaTensorOpMultiplicandTileAccessIterator< + cutlass::MatrixShape, + cutlass::gemm::Operand::kA, + typename RegularWarpIterator::Element, + cutlass::layout::RowMajor, + cutlass::MatrixShape, + OpDelta::kRow, + kWarpSize>; +}; + +// TensorOp - Volta +template +struct DefaultWarpIteratorAFromSharedMemory< + WarpShape, + cutlass::gemm::GemmShape<16, 16, 4>, + RegularWarpIterator, + Policy> { + using InstructionShape = cutlass::gemm::GemmShape<16, 16, 4>; + static constexpr auto kWarpSize = 32; + using OpDelta = typename Policy::Operator::Policy::OpDelta; + + using WarpIterator = + cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< + cutlass::MatrixShape<32, 32>, // MatrixShape, + cutlass::gemm::Operand::kA, + typename RegularWarpIterator::Element, + cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>, + cutlass::MatrixShape<16, 4>, + OpDelta::kRow, + kWarpSize>; +}; + +// Simt +template +struct DefaultWarpIteratorAFromSharedMemory< + WarpShape, + cutlass::gemm::GemmShape<1, 1, 1>, + RegularWarpIterator, + Policy> { + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + static constexpr auto kWarpSize = 32; + + // We just use the same iterator, as we reproduced the same shared-memory + // schema. Just modify it to handle non-complete tiles. 
+ using WarpIterator = RegularWarpIterator; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h b/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h new file mode 100644 index 0000000000..64a58278fe --- /dev/null +++ b/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h @@ -0,0 +1,751 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Epilogue iterator that supports prefetching + + Mostly copied from "cutlass/epilogue/threadblock/predicated_tile_iterator.h" +*/ + +#pragma once + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/threadblock/output_tile_thread_map.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/transform/pitch_linear_thread_map.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////// + +namespace epilogue { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load and store output tile from global memory in +/// epilogue. 
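+/// In addition to load()/store(), it exposes prefetch() and prefetch_all(),
+/// which walk the same row/group/cluster schedule but only issue
+/// prefetch.global.L1 hints so the tile's cache lines are resident before the
+/// actual accesses.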
+/// +/// Satisfies: ReadableTileIterator | PredicatedTileIterator | +/// ForwardTileIterator +/// +template < + typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap) + typename Element_, ///< Element data type + bool ScatterD = false, ///< Scatter D operand or not + bool UseCUDAStore = false> +class PredicatedTileIteratorPrefetch { + public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = Element_; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + static int const kThreads = ThreadMap::kThreads; + static int const kIterations = ThreadMap::Count::kTile; + + static_assert( + ThreadMap::Iterations::kRow > 0, + "ThreadMap::Iterations::kRow must be > 0"); + static_assert( + ThreadMap::Iterations::kGroup > 0, + "ThreadMap::Iterations::kGroup must be > 0"); + static_assert( + ThreadMap::Iterations::kCluster > 0, + "ThreadMap::Iterations::kCluster must be > 0"); + static_assert( + ThreadMap::Iterations::kColumn > 0, + "ThreadMap::Iterations::kColumn must be > 0"); + + /// Fragment object + using Fragment = Array< + Element, + ThreadMap::Iterations::kColumn * ThreadMap::Iterations::kRow * + ThreadMap::Iterations::kGroup * ThreadMap::Iterations::kCluster * + ThreadMap::kElementsPerAccess>; + + /// Memory access size + using AccessType = AlignedArray; + + // + // Parameters struct + // + + /// Uses a non-template class + struct Params : PredicatedTileIteratorParams { + using Base = PredicatedTileIteratorParams; + + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : PredicatedTileIteratorParams( + layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess, + make_OutputTileThreadMapDesc()) {} + + CUTLASS_HOST_DEVICE + Params(Base const& base) : Base(base) {} + }; + + /// Mask object + struct Mask { + static int const kCount = ThreadMap::Iterations::kColumn; + + /// Predicate state + bool predicates[kCount]; + + // + // Mask + // + CUTLASS_HOST_DEVICE + Mask() { + enable(); + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_HOST_DEVICE void clear() { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + predicates[i] = false; + } + } + + ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask + CUTLASS_DEVICE void enable() { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + predicates[i] = true; + } + } + }; + + private: + // + // Data members + // + + /// Parameters structure containing reference and precomputed state. 
+ PredicatedTileIteratorParams params_; + + /// Byte-level pointer + uint8_t* byte_pointer_; + + /// Array of boolean values to contain steady-state predicates + Mask mask_; + + /// Extent of the matrix tile in rows + Index extent_row_; + + /// Extent of the matrix tile in rows + Index extent_column_; + + /// A thread's starting row position (assuming steady-state predicates have + /// been computed) + Index thread_start_row_; + + /// A thread's starting column + Index thread_start_column_; + + /// Internal state counter + int state_[3]; + + /// Scatter indices + int const* indices_; + + // + // Static asserts about internal strides + // + + static_assert(sizeof(extent_row_) == 4, "Expected 32b extents"); + static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents"); + static_assert( + sizeof(PredicatedTileIteratorParams::stride) == 8, + "Expected 64b strides"); + + private: + // + // Methods + // + + public: + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + PredicatedTileIteratorPrefetch( + PredicatedTileIteratorParams const& params, + Element* pointer, + TensorCoord extent, + int thread_idx, + TensorCoord threadblock_offset = TensorCoord(), + int const* indices = nullptr) + : params_(params), indices_(indices) { + TensorCoord thread_offset = + ThreadMap::initial_offset(thread_idx) + threadblock_offset; + + extent_row_ = extent.row(); + extent_column_ = extent.column(); + + thread_start_row_ = thread_offset.row(); + thread_start_column_ = thread_offset.column(); + + // Initialize predicates + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) { + mask_.predicates[c] = + ((thread_offset.column() + ThreadMap::Delta::kColumn * c) < + extent.column()); + } + + // Null pointer performs no accesses + if (!pointer) { + mask_.clear(); + } + + if (ScatterD && !indices) { + mask_.clear(); + } + + // Initialize pointer + byte_pointer_ = reinterpret_cast(pointer) + + LongIndex(thread_offset.row()) * LongIndex(params_.stride) + + LongIndex(thread_offset.column()) * sizeof(AccessType) / + kElementsPerAccess; + + if (ScatterD) { + byte_pointer_ = reinterpret_cast(pointer) + + LongIndex(thread_offset.column()) * sizeof(AccessType) / + kElementsPerAccess; + } + + // Initialize internal state counter + state_[0] = state_[1] = state_[2] = 0; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + byte_pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_DEVICE + void prefetch_all() { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kIterations; ++iter) { + prefetch(); + ++(*this); + } + } + + CUTLASS_DEVICE + void prefetch() { + uint8_t* byte_pointer = byte_pointer_; + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + // on windows using unsigned long here gives the error + // error: asm operand type size(4) does not match + // type/size implied by constraint 'l' + uint64_t addr = (uint64_t)((void*)&memory_pointer + [column * 
ThreadMap::Delta::kColumn / + kElementsPerAccess]); + asm volatile("prefetch.global.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD) { + byte_pointer += params_.increment_row; + } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, int64_t byte_offset) const { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + if (ScatterD && row_guard) { + assert(indices_); + + memory_pointer = reinterpret_cast( + byte_pointer + byte_offset + + LongIndex(indices_[row_offset + thread_start_row_]) * + LongIndex(params_.stride)); + } + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr + [frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer + [column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD) { + byte_pointer += params_.increment_row; + } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) const { + load_with_byte_offset(frag, 0); + } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, int64_t byte_offset) const { + uint8_t* byte_pointer = byte_pointer_; + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + if (ScatterD && row_guard) { + assert(indices_); + + memory_pointer = reinterpret_cast( + byte_pointer + byte_offset + + LongIndex(indices_[row_offset + thread_start_row_]) * + LongIndex(params_.stride)); + } + + 
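+          // Each column access below is predicated on both the row guard and
+          // the per-column mask, so partial tiles at the matrix edges store
+          // only their in-bounds elements.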
CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + bool guard = row_guard && mask_.predicates[column]; + + if (UseCUDAStore) { + if (guard) { + memory_pointer + [column * ThreadMap::Delta::kColumn / kElementsPerAccess] = + frag_ptr + [frag_row_idx * ThreadMap::Iterations::kColumn + + column]; + } + } else { + cutlass::arch::global_store( + frag_ptr + [frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer + [column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); + } + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD) { + byte_pointer += params_.increment_row; + } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) const { + store_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void downsample_load_with_byte_offset( + Fragment& frag, + int64_t byte_offset, + int convolution_P, + int convolution_Q, + int add_P, + int add_Q, + int problem_N) const { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + int output_row = row_offset + thread_start_row_; + int output_N = output_row / (convolution_P * convolution_Q); + int output_PQ = output_row % (convolution_P * convolution_Q); + int output_P = output_PQ / convolution_Q; + int output_Q = output_PQ % convolution_Q; + + int input_row = output_N * 2 * convolution_P * 2 * convolution_Q + + (2 * output_P + add_P) * 2 * convolution_Q + 2 * output_Q + add_Q; + + int64_t byte_offset = + (input_row - output_row) * problem_N * sizeof(float); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr + [frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer + [column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + byte_pointer += params_.increment_row; + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void upsample_load_with_byte_offset( + Fragment& frag, + int64_t byte_offset, + int convolution_P, + int convolution_Q, + int add_P, + int add_Q, + int problem_N) const { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + 
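+    // This variant remaps every output row onto a half-resolution
+    // (convolution_P/2 x convolution_Q/2) input row before loading, i.e. it
+    // effectively performs a 2x nearest-neighbour upsample via a per-row byte
+    // offset; downsample_load_with_byte_offset above applies the inverse
+    // stride-2 mapping.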
CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + int output_row = row_offset + thread_start_row_; + int output_N = output_row / (convolution_P * convolution_Q); + int output_PQ = output_row % (convolution_P * convolution_Q); + int output_P = output_PQ / convolution_Q; + int output_Q = output_PQ % convolution_Q; + int row_add_P = add_P; + int row_add_Q = add_Q; + if (output_P > convolution_P - 2) + row_add_P = 0; + if (output_Q > convolution_Q - 2) + row_add_Q = 0; + + int input_row = output_N * (convolution_P / 2) * (convolution_Q / 2) + + ((output_P + row_add_P) / 2) * (convolution_Q / 2) + + (output_Q + row_add_Q) / 2; + + int64_t byte_offset = + (input_row - output_row) * problem_N * sizeof(float); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr + [frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer + [column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + byte_pointer += params_.increment_row; + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + CUTLASS_DEVICE + MatrixCoord thread_start() const { + return MatrixCoord(thread_start_row_, thread_start_column_); + } + + /// Need to get the thread start row from the tile iterator + CUTLASS_DEVICE + int32_t thread_start_row() const { + return thread_start_row_; + } + + /// Need to get the thread start row from the tile iterator + CUTLASS_DEVICE + int32_t thread_start_column() const { + return thread_start_column_; + } + + /// Extent of the matrix in rows + CUTLASS_DEVICE + Index extent_row() const { + return extent_row_; + } + + /// Extent of the matrix in columns + CUTLASS_DEVICE + Index extent_column() const { + return extent_column_; + } + + /// Advances to the next position to load or store + CUTLASS_HOST_DEVICE + PredicatedTileIteratorPrefetch& operator++() { + ++state_[0]; + + if (!ScatterD) { + byte_pointer_ += params_.advance_row; + } + + thread_start_row_ += ThreadMap::Shape::kRow; + + if (state_[0] == ThreadMap::Count::kRow) { + state_[0] = 0; + ++state_[1]; + byte_pointer_ += params_.advance_group; + + thread_start_row_ += (ThreadMap::Shape::kGroup - 1) * + ThreadMap::Shape::kRow * ThreadMap::Count::kRow; + + if (state_[1] == ThreadMap::Count::kGroup) { + state_[1] = 0; + ++state_[2]; + byte_pointer_ += params_.advance_cluster; + + thread_start_row_ += ThreadMap::Count::kGroup * + ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * + ThreadMap::Shape::kRow; + + if (state_[2] == ThreadMap::Count::kCluster) { + state_[2] = 0; + byte_pointer_ += params_.advance_tile; + } + } + } + + 
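+    // State summary: state_[0], state_[1] and state_[2] count row, group and
+    // cluster iterations in the order the ThreadMap enumerates them.
+    // advance_row is applied on every increment (unless ScatterD), while
+    // advance_group, advance_cluster and advance_tile are applied only when
+    // the row, group and cluster counters wrap. thread_start_row_ is advanced
+    // in step so the row guards used by load()/store() remain valid.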
return *this; + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_DEVICE void clear_mask() { + mask_.clear(); + } + + ///< Efficiently enables all accesses guarded by mask + CUTLASS_DEVICE void enable_mask() { + mask_.enable(); + } + + ///< Sets the mask + CUTLASS_DEVICE void get_mask(Mask& mask) const { + mask = mask_; + } + + ///< Sets the mask + CUTLASS_DEVICE void set_mask(Mask const& mask) { + mask_ = mask; + } +}; + +template +struct MakePrefetchableIterator { + using Iterator = PredicatedTileIteratorPrefetch< + typename IT::ThreadMap, + typename IT::Element>; +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/iterators/make_residual_last.h b/examples/41_fused_multi_head_attention/iterators/make_residual_last.h new file mode 100644 index 0000000000..845a3c6b7a --- /dev/null +++ b/examples/41_fused_multi_head_attention/iterators/make_residual_last.h @@ -0,0 +1,97 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include "predicated_tile_access_iterator_residual_last.h" +#include "predicated_tile_iterator_residual_last.h" + +namespace cutlass { +namespace transform { +namespace threadblock { + +template +struct MakeIteratorResidualLast; + +template < + typename Shape, + typename Element, + typename Layout, + int AdvanceRank, + typename ThreadMap, + int AccessSize, + bool Gather> +struct MakeIteratorResidualLast> { + using Iterator = PredicatedTileIteratorResidualLast< + Shape, + Element, + Layout, + AdvanceRank, + ThreadMap, + AccessSize, + Gather>; +}; + +template < + typename Shape, + typename Element, + typename Layout, + int AdvanceRank, + typename ThreadMap, + typename AccessType, + bool Gather> +struct MakeIteratorResidualLast> { + using Iterator = PredicatedTileAccessIteratorResidualLast< + Shape, + Element, + Layout, + AdvanceRank, + ThreadMap, + AccessType, + Gather>; +}; +} // namespace threadblock +} // namespace transform +} // namespace cutlass diff --git a/examples/41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h b/examples/41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h new file mode 100644 index 0000000000..6bc9e52c3c --- /dev/null +++ b/examples/41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h @@ -0,0 +1,2114 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates calculating the address and predicates to the load of tiles + from pitch-linear rank=2 tensors. + + This iterator uses masks to guard out-of-bounds accesses. 
The first tile + this iterator visits maybe partial, then the remaining tiles are complete. + So, we only need to compute the predicates twice, once before the first tile + and once for the remaining full tiles which can share the same predicates. + + A precomputed "Params" object minimizes the amount of state that must be + stored in registers, and integer addition is used to advance the pointer + through memory. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h" + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace transform { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// PredicatedTileAccessIteratorResidualLast +/// +template < + typename Shape, + typename Element, + typename Layout, + int AdvanceRank, + typename ThreadMap, + typename AccessType, + bool Gather = false> +class PredicatedTileAccessIteratorResidualLast; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for pitch-linear +/// data. +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_, + bool Gather> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::PitchLinear, + AdvanceRank, + ThreadMap_, + AccessType_, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::PitchLinear; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates< + Shape, + Element, + Layout, + AdvanceRank, + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = + ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert( + !(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + using Mask = typename UnderlyingPredicates::Mask; + + /// Uses a non-template class + struct Params : PredicatedTileAccessIteratorParams { + using Base = PredicatedTileAccessIteratorParams; + + // Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : Base( + layout.stride(0), + MakePredicatedTileAccessIteratorDesc< + Shape, + Element, + Layout, + kAdvanceRank, + ThreadMap>()()) {} + + CUTLASS_HOST_DEVICE + Params(Base const& base) : 
Base(base) {} + }; + + private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + + private: + // + // Data members + // + + UnderlyingPredicates the_predicates; + Mask residual_tile_mask; + + /// Parameters object with precomputed internal state + Params params_; + + /// Internal pointer to first access of tile + BytePointer pointer_; + + /// Below is used when Gather is turned on. We need to record strided_offset + /// and contiguous_offset separated to compute the offset by using + /// + /// offset = contiguous_offset + indices[strided_offset] + /// + + /// Gather indices + int const* indices_; + + Index gather_offset_strided; + + private: + /// Computes predicates based on internally tracked per-thread offset. + CUTLASS_DEVICE + void compute_predicates_( + /// Extent of the matrix window + TensorCoord extent, + /// optionally, simplify predicate calculation during 'steady state' phase + bool is_steady_state = false) { + the_predicates.compute_predicates_(extent, is_steady_state); + } + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + /// Gather indices + int const* indices = nullptr) + : params_(params), + pointer_(reinterpret_cast( + const_cast(pointer))), + the_predicates(extent), + indices_(indices) { + the_predicates.set_predicates(thread_id, threadblock_offset); + the_predicates.get_mask(residual_tile_mask); + + // Working around a weird compiler bug happening on P100 for the backward. + // I've seen together: the_predicates.predicates_[0] = 14 (instead of 15) + // residual_tile_mask[0] = 15 (correct) + // + // Adding prints when the value is calculated (in `compute_predicates_`) + // sometimes removes the bug. The consequence is that we skip some + // element of a tensor, leading to wrong results + // Setting `compute_predicates_`'s second argument (`is_steady_state`) to + // true also seems to get rid of the bug - at the cost of twice as many + // comparisons. 
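+    // How the two masks set up here are expected to be used (sketch only; the
+    // loop and iterator names below are illustrative, not part of this file):
+    // `residual_tile_mask` saved above guards the partial tile implied by
+    // `extent`, while the mask computed below assumes full, steady-state
+    // tiles. A mainloop would keep the steady-state mask and swap in the
+    // residual mask only for the final, possibly partial tile, e.g.
+    //
+    //   for (int k = k_iterations; k > 0; --k) {
+    //     iter.set_residual_tile(k == 1);  // last tile may be partial
+    //     // ... issue accesses via iter.get() / iter.valid() ...
+    //     ++iter;
+    //   }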
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) + constexpr bool kWorkAroundCompilerBug = false; +#else + constexpr bool kWorkAroundCompilerBug = true; +#endif + the_predicates.compute_predicates_(extent, true && !kWorkAroundCompilerBug); + + // update internal pointers + Layout layout(params_.stride_); + + if (!Gather) { + add_pointer_offset(layout(the_predicates.thread_offset_)); + } else { + gather_offset_strided = the_predicates.thread_offset_.strided(); + add_pointer_offset( + layout(make_Coord(the_predicates.thread_offset_.contiguous(), 0))); + } + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + the_predicates.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool is_residual_tile) { + if (is_residual_tile) { + the_predicates.set_mask(residual_tile_mask); + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += sizeof_bits::value * pointer_offset / 8; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + if (!Gather) { + if (kAdvanceRank) { + pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided()); + pointer_ += Shape::kContiguous * tile_offset.contiguous(); + } else { + pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous()); + pointer_ += Shape::kStrided * tile_offset.strided(); + } + } else { + add_pointer_offset(Shape::kContiguous * tile_offset.contiguous()); + gather_offset_strided += Shape::kStrided * tile_offset.strided(); + } + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + if (Gather) { + assert(indices_); + + if (!valid()) { + return nullptr; + } + + LongIndex contiguous_offset = the_predicates.iteration_contiguous_ * + (ThreadMap::Delta::kContiguous * sizeof_bits::value / + 8) + + the_predicates.iteration_vector_; + int strided_index = gather_offset_strided + + the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided; + + LongIndex strided_offset = indices_[strided_index] * + LongIndex(params_.stride_) * sizeof_bits::value / 8; + + return reinterpret_cast( + pointer_ + contiguous_offset + strided_offset); + } + + return reinterpret_cast( + pointer_ + + the_predicates.iteration_contiguous_ * + (ThreadMap::Delta::kContiguous * + sizeof_bits::value) / + 8) + + the_predicates.iteration_vector_; + } + + /// Increment and return an instance to self. 
+ CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + the_predicates.operator++(); + + ++the_predicates.iteration_vector_; + if (the_predicates.iteration_vector_ < kAccessesPerVector) { + return *this; + } + + the_predicates.iteration_vector_ = 0; + ++the_predicates.iteration_contiguous_; + + if (the_predicates.iteration_contiguous_ < + ThreadMap::Iterations::kContiguous) { + return *this; + } + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + the_predicates.iteration_contiguous_ = 0; + ++the_predicates.iteration_strided_; + + if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) { + if (!Gather) { + pointer_ += params_.inc_strided_; + } + + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + the_predicates.iteration_strided_ = 0; + + if (!Gather) { + // advance to next tile + pointer_ += params_.inc_next_; + + // now return to start tile - if the iterator is subsequently advanced, + // this subtraction as well as the subsequent integer addition are both + // elided by the compiler. + pointer_ -= params_.inc_advance_; + } + + return *this; + } + + /// Increment and return an instance to self. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + the_predicates.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + the_predicates.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + the_predicates.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + the_predicates.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() const { + return the_predicates.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major +/// data. 
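+/// This is a thin adapter: the column-major problem is reinterpreted as a
+/// pitch-linear one with (contiguous, strided) == (rows, columns), and every
+/// method forwards to the pitch-linear PredicatedTileAccessIteratorResidualLast
+/// defined above.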
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_, + bool Gather> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::ColumnMajor, + AdvanceRank, + ThreadMap_, + AccessType_, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 0 : 1), + ThreadMap, + AccessType, + Gather>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))){}; + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row(), + threadblock_offset.column()), + indices) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : 
PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major +/// data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_, + bool Gather> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::RowMajor, + AdvanceRank, + ThreadMap_, + AccessType_, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 1 : 0), + ThreadMap, + AccessType, + Gather>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))){}; + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + /// Gather indices + int const* indices = nullptr) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column(), + threadblock_offset.row()), + indices) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} 
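+  // Note: relative to the column-major specialization above, the coordinate
+  // roles are transposed here -- the underlying pitch-linear iterator sees
+  // (contiguous, strided) == (columns, rows). For a row-major M x N tensor the
+  // constructors above therefore pass the extent as PitchLinearCoord(N, M) and
+  // the threadblock offset as (offset.column(), offset.row()).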
+ + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2 +/// data. 
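+/// Unlike the pitch-linear case, an AffineRankN<2> layout carries two run-time
+/// strides, so the nested Params object precomputes explicit byte increments
+/// (inc_contiguous_, inc_strided_, inc_next_strided_, inc_next_, inc_advance_)
+/// instead of reusing PredicatedTileAccessIteratorParams; for example,
+/// inc_contiguous_ is stride(0) * ThreadMap::Delta::kContiguous elements,
+/// converted to bytes.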
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::AffineRankN<2>, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRankN<2>; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates< + Shape, + Element, + layout::PitchLinear, + AdvanceRank, + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = + ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert( + !(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingPredicates::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + public: + friend PredicatedTileAccessIteratorResidualLast; + + private: + /// stride of pitch-linear layout (units of Element) + Coord stride_; + /// amount (in byte) to increment pointer to move to next access along + /// contiguous dimension + LongIndex inc_contiguous_; + /// amount (in byte) to increment pointer from first access of current + /// contiguous dimension to first access of next one. + LongIndex inc_strided_; + /// amount (in byte) to increment pointer from last access of current + /// contiguous dimension to first access of next one. 
+ LongIndex inc_next_strided_; + /// amount (in byte) to increment pointer from last access to first access + /// of next tile + LongIndex inc_next_; + /// amount (in byte) to increment pointer from first access of current tile + /// to first access of next tile + LongIndex inc_advance_; + + public: + // Default ctor + CUTLASS_HOST_DEVICE + Params() + : stride_(0), + inc_contiguous_(0), + inc_strided_(0), + inc_next_(0), + inc_advance_(0) {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : stride_({layout.stride(0), layout.stride(1)}) { + inc_contiguous_ = + (LongIndex(stride_[0]) * ThreadMap::Delta::kContiguous) * + sizeof_bits::value / 8; + + inc_strided_ = (LongIndex(stride_[1]) * ThreadMap::Delta::kStrided) * + sizeof_bits::value / 8; + + inc_next_strided_ = inc_strided_ - + LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_; + + if (kAdvanceRank) { + // advance along strided dimension + inc_advance_ = Shape::kStrided * LongIndex(stride_[1]) * + sizeof_bits::value / 8; + } else { + // advance along contiguous dimension + inc_advance_ = + Shape::kContiguous * stride_[0] * sizeof_bits::value / 8; + } + + inc_next_ = inc_advance_ - + LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_ - + LongIndex(ThreadMap::Iterations::kStrided - 1) * inc_strided_; + }; + }; + + private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + + // + // Data members + // + + /// Parameters object with precomputed internal state + Params params_; + + /// Internal pointer to first access of tile + BytePointer pointer_; + + UnderlyingPredicates the_predicates; + Mask residual_tile_mask; + + private: + /// Computes predicates based on internally tracked per-thread offset. 
+ CUTLASS_DEVICE + void compute_predicates_( + /// Extent of the matrix window + TensorCoord extent, + /// optionally, simplify predicate calculation during 'steady state' phase + bool is_steady_state = false) { + the_predicates.compute_predicates_(extent, is_steady_state); + } + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : params_(params), + pointer_(reinterpret_cast( + const_cast(pointer))), + the_predicates(extent) { + the_predicates.set_predicates(thread_id, threadblock_offset); + + // update internal pointers + Layout layout(params_.stride_); + add_pointer_offset(layout(the_predicates.thread_offset_)); + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + the_predicates.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool is_residual_tile) { + if (is_residual_tile) { + the_predicates.set_mask(residual_tile_mask); + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += sizeof_bits::value * pointer_offset / 8; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + if (kAdvanceRank) { + pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1]); + pointer_ += Shape::kContiguous * tile_offset[0]; + } else { + pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0]); + pointer_ += Shape::kStrided * tile_offset[1]; + } + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(pointer_) + + the_predicates.iteration_vector_; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + the_predicates.operator++(); + ++the_predicates.iteration_vector_; + if (the_predicates.iteration_vector_ < kAccessesPerVector) { + return *this; + } + + the_predicates.iteration_vector_ = 0; + ++the_predicates.iteration_contiguous_; + + if (the_predicates.iteration_contiguous_ < + ThreadMap::Iterations::kContiguous) { + pointer_ += params_.inc_contiguous_; + return *this; + } + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + the_predicates.iteration_contiguous_ = 0; + ++the_predicates.iteration_strided_; + + if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) { + pointer_ += params_.inc_next_strided_; + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + the_predicates.iteration_strided_ = 0; + + // advance to next tile + pointer_ += params_.inc_next_; + + // now return to start tile - if the iterator is subsequently advanced, this + // subtraction as well as the subsequent integer addition are both elided by + // the compiler. + pointer_ -= params_.inc_advance_; + + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + the_predicates.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + the_predicates.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + the_predicates.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + the_predicates.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return the_predicates.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2 +/// column-major data. 
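+/// The column-major flavor forwards its two strides to the AffineRankN<2>
+/// iterator in declaration order (stride(0), stride(1)); the row-major
+/// specialization further below swaps them.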
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::AffineRank2ColumnMajor, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 0 : 1), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){}; + }; + + private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row(), + threadblock_offset.column())) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal 
iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset( + make_Coord(tile_offset.row(), tile_offset.column())); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank-2 +/// row-major data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::AffineRank2RowMajor, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 1 : 0), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){}; + }; + + private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column(), + threadblock_offset.row())) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal 
iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset( + make_Coord(tile_offset.column(), tile_offset.row())); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major +/// interleaved data. It is mapped to the congruous layout. 
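+/// Concretely, a ColumnMajorInterleaved<k> tile of Shape::kRow x Shape::kColumn
+/// is presented to the pitch-linear iterator as
+/// PitchLinearShape<Shape::kRow * k, Shape::kColumn / k>, and extents and
+/// offsets are scaled the same way (rows multiplied by k, columns divided by
+/// k). For instance, with k == 32 a 128 x 64 tile is traversed as a 4096 x 2
+/// pitch-linear region.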
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// + +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_, + int InterleavedK> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::ColumnMajorInterleaved, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::ColumnMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape< + Shape::kRow * kInterleavedK, + Shape::kColumn / kInterleavedK>, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 0 : 1), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord( + extent.row() * kInterleavedK, + extent.column() / kInterleavedK), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row() * kInterleavedK, + threadblock_offset.column() / kInterleavedK)) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent 
of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major +/// interleaved data. +// It is mapped to the congruous layout. 
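+//
+// Both interleaved specializations simply re-express the logical matrix
+// extent in pitch-linear coordinates before delegating to the pitch-linear
+// iterator. As an illustrative example (the numbers below are only for
+// illustration), with kInterleavedK = 32 and an extent of
+// (row = 128, column = 64):
+//
+//   ColumnMajorInterleaved<32> -> PitchLinearCoord(128 * 32, 64 / 32) = (4096, 2)
+//   RowMajorInterleaved<32>    -> PitchLinearCoord(64 * 32, 128 / 32) = (2048, 4)
+//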
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_, + int InterleavedK> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::RowMajorInterleaved, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::RowMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape< + Shape::kColumn * kInterleavedK, + Shape::kRow / kInterleavedK>, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 1 : 0), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord( + extent.column() * kInterleavedK, + extent.row() / kInterleavedK), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column() * kInterleavedK, + threadblock_offset.row() / kInterleavedK)) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of 
tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace transform +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h b/examples/41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h new file mode 100644 index 0000000000..4db56560fc --- /dev/null +++ b/examples/41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h @@ -0,0 +1,2119 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of tiles from pitch-linear rank=2 + tensors. + + This iterator uses masks to guard out-of-bounds accesses. The first tile + this iterator visits maybe partial, then the remaining tiles are complete. + So, we only need to compute the predicates twice, once before the first tile + and once for the remaining full tiles which can share the same predicates. + + A precomputed "Params" object minimizes the amount of state that must be + stored in registers, and integer addition is used to advance the pointer + through memory. +*/ + +#pragma once + +#include "cutlass/arch/memory.h" +#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace transform { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// PredicatedTileIteratorResidualLast +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +/// Regular tile iterator using a precomputed control structure to minimize +/// register liveness and integer arithmetic. +/// +/// Layout is assumed to be invariant at the time the precomputed "Params" +/// object is constructed. +/// +/// Base pointer and tensor extents may be specified at the time the iterator is +/// constructed. Subsequently, they are assumed to be immutable. +/// +/// Adding a logical coordinate offset may be performed at the time the iterator +/// is constructed. Subsequent additions to logical coordinate offset may be +/// performed but are relatively expensive. +/// +/// Visitation order is intended to first visit a "residual" tile that may be +/// partially full in both the advance dimension and the steady-state dimension. +/// This is assumed to be the last tile in the iteration sequence. 
Advancing an +/// iterator that has just been constructed moves to the first tile that is full +/// in the advance dimension and recomputes predicates. Subsequent accesses may +/// be performed without updating internal predicates and are efficient in terms +/// of live register state and pointer arithmetic instructions. +/// +/// To be efficient, this assumes the iterator will be dereferenced and advanced +/// at least once outside any looping structure to minimize integer arithmetic. +/// +/// Acceses out of bounds are safe so long as `clear_mask()` is called prior to +/// dereferencing the iterator. +/// +/// +/// Example: +/// +/// An efficient pipeline structure may be constructed as follows: +/// +// template +// __global__ void kernel( +// typename Iterator::Params params, +// typename Iterator::Element *ptr, +// TensorCoord extent) { +// +// typename Iterator::Fragment fragment; +// +// TensorCoord threadblock_offset(0, 0); +// +// Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets); +// +// +// fragment = *iter; // load "residue" tile first +// ++iter; // advance to first "steady state" tile and update +// internal masks +// +// +// #pragma unroll +// for (int i = Remaining - 1; i >= 0; --i) { +// +// f(fragment); +// +// if (!i) { +// iter.clear_mask(); // light-weight operation to clear masks - +// subsequent loads become NO-OPs. +// } +// +// fragment = *iter; // load tile during "steady state" phase +// ++iter; // advance to next tile - lightweight due to +// steady-state masks +// } +// } +// +// void host(TensorView view) { +// +// using Iterator = +// transform::threadblock::PredicatedTileIteratorResidualLast; +// +// typename Iterator::Params params(view.layout()); +// +// kernel(params, view.data()); +// } +/// +/// +template < + typename Shape, + typename Element, + typename Layout, + int AdvanceRank, + typename ThreadMap, + int AccessSize = ThreadMap::kElementsPerAccess, + bool Gather = false> +class PredicatedTileIteratorResidualLast; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data. 
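+///
+/// A minimal usage sketch of the residual-last pattern follows. It is
+/// illustrative only: the names (k_iterations, frag, tb_offset, f) are
+/// placeholders, and the exact point at which set_residual_tile() is toggled
+/// depends on the surrounding mainloop. The intent is that predicates are
+/// recomputed only twice: once for the steady-state tiles and once for the
+/// final, possibly partial, residual tile.
+///
+//   Iterator iter(params, ptr, extent, threadIdx.x, tb_offset);
+//
+//   iter.set_residual_tile(k_iterations == 1);  // next tile may be partial
+//   iter.load(frag);                            // load the first tile
+//   ++iter;                                     // update masks, enter steady state
+//
+//   for (; k_iterations > 1; --k_iterations) {
+//     f(frag);
+//     iter.set_residual_tile(k_iterations == 2); // recompute masks only once
+//                                                // more, before the last tile
+//     iter.load(frag);
+//     ++iter;
+//   }
+//   f(frag);
+///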
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize, + bool Gather> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::PitchLinear, + AdvanceRank, + ThreadMap_, + AccessSize, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::PitchLinear; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + /// Type used for internal memory accesses + using AccessType = AlignedArray< + Element, + AccessSize, + (AccessSize * sizeof_bits::value / 8)>; + + /// Underlying iterator to compute the addresses + using TileAccessIterator = PredicatedTileAccessIteratorResidualLast< + Shape, + Element, + Layout, + kAdvanceRank, + ThreadMap, + AccessType, + Gather>; + + static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename TileAccessIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + public: + using Base = typename TileAccessIterator::Params::Base; + + friend PredicatedTileIteratorResidualLast; + + private: + /// Parameters object + typename TileAccessIterator::Params params_; + + public: + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout) {} + + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(Base const& base) : params_(base) {} + }; + + private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + + private: + // + // Data members + // + + /// Data member to the tile access iterator + TileAccessIterator address_iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + /// Gather indices + int const* indices = nullptr) + : address_iterator_( + params.params_, + pointer, + extent, + thread_id, + threadblock_offset, + indices) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< 
ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + address_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + if (kAdvanceRank) + address_iterator_.add_tile_offset({0, 1}); + else + address_iterator_.add_tile_offset({1, 0}); + + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + address_iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + address_iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + address_iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + address_iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + address_iterator_.get_mask(mask); + } + + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + load_with_byte_offset( + frag, pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + address_iterator_.set_iteration_index(idx); + char const* byte_ptr = + reinterpret_cast(address_iterator_.get()) + + byte_offset; + + AccessType const* access_ptr = + reinterpret_cast(byte_ptr); + + cutlass::arch::global_load( + frag_ptr[idx], access_ptr, address_iterator_.valid()); + + ++address_iterator_; + } + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_byte_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + store_with_byte_offset( + frag, pointer_offset * sizeof_bits::value / 8); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + address_iterator_.set_iteration_index(0); + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s 
= 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + char* byte_ptr = + reinterpret_cast(address_iterator_.get()) + byte_offset; + AccessType* access_ptr = reinterpret_cast(byte_ptr); + + if (address_iterator_.valid()) { + *access_ptr = frag_ptr[idx]; + } + ++address_iterator_; + } + } + } + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_byte_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize, + bool Gather> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::ColumnMajor, + AdvanceRank, + ThreadMap_, + AccessSize, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
0 : 1), + ThreadMap, + AccessSize, + Gather>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row(), + threadblock_offset.column()), + indices) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
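+  ///
+  /// Note that the postfix form copies the iterator before advancing, so the
+  /// prefix form is generally preferable inside the mainloop, e.g.
+  /// (illustrative):
+  ///
+  ///   iter.load(frag);
+  ///   ++iter;   // prefix increment avoids the temporary copy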
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize, + bool Gather> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::RowMajor, + AdvanceRank, + ThreadMap_, + AccessSize, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessSize, + Gather>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = nullptr ///< Gather indices + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column(), + threadblock_offset.row()), + indices) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for affine rank-2 data. 
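+///
+/// Unlike the pitch-linear specializations above, an affine rank-2 layout
+/// carries two independent strides, so Params is constructed from the layout
+/// directly rather than from a single pitch. The AffineRank2ColumnMajor and
+/// AffineRank2RowMajor specializations further below only reorder those
+/// strides when building the underlying AffineRankN<2> iterator, roughly:
+///
+//   AffineRank2ColumnMajor -> AffineRankN<2>(layout.stride(0), layout.stride(1))
+//   AffineRank2RowMajor    -> AffineRankN<2>(layout.stride(1), layout.stride(0))
+///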
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::AffineRankN<2>, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRankN<2>; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + /// Type used for internal memory accesses + using AccessType = AlignedArray< + Element, + AccessSize, + (AccessSize * sizeof_bits::value / 8)>; + + /// Underlying iterator to compute the addresses + using TileAccessIterator = PredicatedTileAccessIteratorResidualLast< + Shape, + Element, + Layout, + kAdvanceRank, + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename TileAccessIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + public: + friend PredicatedTileIteratorResidualLast; + + private: + /// Parameters object + typename TileAccessIterator::Params params_; + + public: + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout) {} + + CUTLASS_HOST_DEVICE + Params() {} + }; + + private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + + private: + // + // Data members + // + + /// Data member to the tile access iterator + TileAccessIterator address_iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : address_iterator_( + params.params_, + pointer, + extent, + thread_id, + threadblock_offset) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + 
params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + address_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + if (kAdvanceRank) + address_iterator_.add_tile_offset(make_Coord(0, 1)); + else + address_iterator_.add_tile_offset(make_Coord(1, 0)); + + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + address_iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + address_iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + address_iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + address_iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + address_iterator_.get_mask(mask); + } + + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + load_with_byte_offset( + frag, pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + address_iterator_.set_iteration_index(idx); + char const* byte_ptr = + reinterpret_cast(address_iterator_.get()) + + byte_offset; + + AccessType const* access_ptr = + reinterpret_cast(byte_ptr); + + cutlass::arch::global_load( + frag_ptr[idx], access_ptr, address_iterator_.valid()); + + ++address_iterator_; + } + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_byte_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + store_with_byte_offset( + frag, pointer_offset * sizeof_bits::value / 8); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + address_iterator_.set_iteration_index(0); + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + 
CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + char* byte_ptr = + reinterpret_cast(address_iterator_.get()) + byte_offset; + AccessType* access_ptr = reinterpret_cast(byte_ptr); + + if (address_iterator_.valid()) { + *access_ptr = frag_ptr[idx]; + } + ++address_iterator_; + } + } + } + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_byte_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2 +/// column-major data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::AffineRank2ColumnMajor, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 
0 : 1), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {} + }; + + private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row(), + threadblock_offset.column())) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2 +/// row-major data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::AffineRank2RowMajor, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {} + }; + + private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column(), + threadblock_offset.row())) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for interleaved data. +/// It is mapped to the congruous layout. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// + +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize, + int InterleavedK> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::ColumnMajorInterleaved, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::ColumnMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape< + Shape::kRow * kInterleavedK, + Shape::kColumn / kInterleavedK>, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
0 : 1), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord( + extent.row() * kInterleavedK, + extent.column() / kInterleavedK), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row() * kInterleavedK, + threadblock_offset.column() / kInterleavedK)) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
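+  ///
+  /// Advancing, like construction, delegates to the underlying pitch-linear
+  /// iterator; the interleave factor is folded into the contiguous extent.
+  /// For instance (illustrative numbers), with kInterleavedK = 32 an extent
+  /// of (rows = 128, columns = 64) is presented to the underlying iterator
+  /// as the pitch-linear extent (128 * 32, 64 / 32) = (4096, 2), and the
+  /// threadblock offset is remapped the same way.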
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for interleaved-32 +/// data. It is mapped to the congruous layout. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize, + int InterleavedK> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::RowMajorInterleaved, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::RowMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape< + Shape::kColumn * kInterleavedK, + Shape::kRow / kInterleavedK>, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord( + extent.column() * kInterleavedK, + extent.row() / kInterleavedK), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column() * kInterleavedK, + threadblock_offset.row() / kInterleavedK)) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace transform +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/iterators/transpose_warp_iterator.h b/examples/41_fused_multi_head_attention/iterators/transpose_warp_iterator.h new file mode 100644 index 0000000000..f0f8ea6086 --- /dev/null +++ b/examples/41_fused_multi_head_attention/iterators/transpose_warp_iterator.h @@ -0,0 +1,55 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "warp_iterator_from_smem.h" + +template +struct TransposeWarpIterator { + using Iterator = char; + static bool constexpr kSupportsTranspose = false; +}; + +template < + /// Operand identity + cutlass::gemm::Operand Operand, + /// Data type of A elements + typename Element, + typename InstructionShape, + bool kTranspose> +struct TransposeWarpIterator< + cutlass::gemm::warp:: + WarpIteratorFromSmem> { + using Iterator = cutlass::gemm::warp:: + WarpIteratorFromSmem; + static bool constexpr kSupportsTranspose = true; +}; diff --git a/examples/41_fused_multi_head_attention/iterators/warp_iterator_from_smem.h b/examples/41_fused_multi_head_attention/iterators/warp_iterator_from_smem.h new file mode 100644 index 0000000000..d19b1907d5 --- /dev/null +++ b/examples/41_fused_multi_head_attention/iterators/warp_iterator_from_smem.h @@ -0,0 +1,283 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Inspired from + "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h" Loads tiles of GEMM + operands from a RowMajor shared-memory layout into registers to use by A100 + TensorCores. + + The difference with "mma_tensor_op_tile_access_iterator.h" is that: + (1) We use "ldmatrix" to load tiles, rather than manual loads (slightly + faster) (2) We support to transpose the operand (eg read `A.transpose()` when + the shared memory holds `A`) + + This is only implemented for the specific shapes. +*/ +#pragma once + +#include + +//////////////////////////////////////////////////////////////////////////////// +namespace cutlass { +namespace gemm { +namespace warp { + +template < + /// Operand identity + Operand Operand_, + /// Data type of A elements + typename Element_, + typename InstructionShape_, + bool kTranspose = false> +class WarpIteratorFromSmem { + public: + /// Shape of tile to load (concept: MatrixShape) + using Shape = cutlass::MatrixShape<32, 32>; + + /// Operand tag + static Operand const kOperand = Operand_; + static_assert( + kOperand == Operand::kA, + "No support for OperandB at the moment"); + + /// Basic check + static_assert( + kOperand == Operand::kA || kOperand == Operand::kB, + "WarpIteratorFromSmem may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + static_assert(sizeof_bits::value == 16, "Only supported for half"); + + /// Layout of source tile + using Layout = cutlass::layout::RowMajor; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + static_assert(InstructionShape::kRow == 16, "Only supports 16x8x8 / 16x8x16"); + static_assert( + InstructionShape::kColumn == 8 || InstructionShape::kColumn == 16, + "Only supports 16x8x8 / 16x8x16"); + + /// Delta between *MMA operations (in units of *MMA operations, concept: + /// MatrixShape) + static int const kOpDelta = 1; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Number of elements accessed per Shared Memory load + static int const kElementsPerAccess = + (sizeof_bits::value >= 32 ? 1 + : 32 / sizeof_bits::value); + + using InstructionCount = MatrixShape< + Shape::kRow / InstructionShape::kRow, + Shape::kColumn / InstructionShape::kColumn>; + + static int const kIterations = (kOperand == Operand::kA) + ? InstructionCount::kColumn + : InstructionCount::kRow; + + public: + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = Array< + Element, + (kOperand == Operand::kA) + ? (Shape::kRow* InstructionShape::kColumn / kThreads) + : (Shape::kColumn* InstructionShape::kRow / kThreads)>; + + /// Memory access type + // using AccessType = AlignedArray; + using AccessType = Array; + + static int constexpr kWarpShapeDivisibleInner = + (kOperand == Operand::kA ? 
InstructionShape::kColumn + : InstructionShape::kRow); + static int constexpr kAccessesInner = + (kWarpShapeDivisibleInner / kElementsPerAccess) / 4; + // Number of 32bits tiles to load per `ldmatrix` + static int const kTilesPerInstruction = InstructionShape::kRow / 8; + static_assert(kTilesPerInstruction == 2, "Only supports 16x8x16 and 16x8x8"); + + private: + /// Underlying tensor reference + TensorRef ref_; + + /// Origin + MatrixCoord origin_; + + /// Iterations in a tile + int iterations_; + + public: + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem(TensorRef const& ref, int lane_id) + : WarpIteratorFromSmem(ref, {Shape::kRow, Shape::kColumn}, lane_id) {} + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem(TensorRef const& ref, TensorCoord extent, int lane_id) + : ref_(ref), iterations_(0) { + // See also: + // https://docs.nvidia.com/cuda/archive/11.7.1/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-1688 + // 16x8x8: kAccessesInner = 1 (1 ldmatrix.x4) + // 16x8x16: kAccessesInner = 2 (2 ldmatrix.x4) + int ldsm_vec_num = (lane_id >> 3); + if (kOperand == Operand::kA) { + origin_ = MatrixCoord(lane_id % 8, 0); + static_assert( + InstructionCount::kRow * kTilesPerInstruction == 4, + "can't use ldmatrix.x4"); + int access_m_idx = ldsm_vec_num % kTilesPerInstruction; + int inner_idx = (ldsm_vec_num / kTilesPerInstruction) % kAccessesInner; + int inst_m_idx = ldsm_vec_num / (kTilesPerInstruction * kAccessesInner); + MatrixCoord offset( + access_m_idx * 8 + inst_m_idx * InstructionShape::kRow, + inner_idx * 4 * kElementsPerAccess); + if (kTranspose) { + offset = MatrixCoord(offset.column(), offset.row()); + } + origin_ += offset; + } else { + // Note: This is not tested or used + origin_ = MatrixCoord(0, lane_id % 8); + static_assert(InstructionCount::kColumn * kAccessesInner == 4, ""); + CUTLASS_PRAGMA_UNROLL + for (int inst_n_idx = 0; inst_n_idx < InstructionCount::kColumn; + ++inst_n_idx) { + CUTLASS_PRAGMA_UNROLL + for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) { + int access_idx = inner_idx + kAccessesInner * inst_n_idx; + + MatrixCoord offset( + inner_idx * 4 * kElementsPerAccess, inst_n_idx * 8); + + if (access_idx == ldsm_vec_num) { + if (kTranspose) { + offset = MatrixCoord(offset.column(), offset.row()); + } + origin_ += offset; + } + } + } + } + + ref_.add_coord_offset(origin_); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem& add_tile_offset(TensorCoord const& tile_offset) { + TensorCoord coord_offset( + tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + if (kTranspose) { + coord_offset = TensorCoord{coord_offset.column(), coord_offset.row()}; + } + origin_ += coord_offset; + + ref_.add_coord_offset(coord_offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + void advance() { + if (kOperand == Operand::kA) { + add_tile_offset({0, 1}); + } else { + add_tile_offset({1, 0}); + } + + iterations_ = 0; + } + + /// increase iterations in a tile + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem& operator++() { + iterations_++; + + if (iterations_ >= kIterations) + advance(); + + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
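+  ///
+  /// Worked example for fp16 operand A: with a 16x8x16 instruction,
+  /// kElementsPerAccess = 32 / 16 = 2, kAccessesInner = (16 / 2) / 4 = 2 and
+  /// kTilesPerInstruction = 16 / 8 = 2, so each load() issues
+  /// (InstructionCount::kRow * kTilesPerInstruction * kAccessesInner) / 4
+  /// = (2 * 2 * 2) / 4 = 2 ldmatrix.x4 accesses; a 16x8x8 instruction gives
+  /// kAccessesInner = 1 and a single ldmatrix.x4, matching the note in the
+  /// constructor above.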
+ CUTLASS_DEVICE + void load(Fragment& frag) const { + AccessType* access_ptr = reinterpret_cast(&frag); + using LoadLayout = typename platform:: + conditional::type; + + CUTLASS_PRAGMA_UNROLL + for (int access_m_idx = 0; access_m_idx < + (InstructionCount::kRow * kTilesPerInstruction * kAccessesInner) / 4; + ++access_m_idx) { + MatrixCoord offset; + if (kOperand == Operand::kA) { + offset = MatrixCoord( + access_m_idx * 16, iterations_ * InstructionShape::kColumn); + } else { + offset = MatrixCoord(iterations_ * InstructionShape::kRow, 0); + } + if (kTranspose) { + offset = MatrixCoord(offset.column(), offset.row()); + } + cutlass::arch::ldsm( + access_ptr[access_m_idx], ref_.data() + ref_.offset(offset)); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/41_fused_multi_head_attention/kernel_backward.h b/examples/41_fused_multi_head_attention/kernel_backward.h new file mode 100644 index 0000000000..6fd94a6c58 --- /dev/null +++ b/examples/41_fused_multi_head_attention/kernel_backward.h @@ -0,0 +1,2554 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#ifdef HAS_PYTORCH +#include +#include +#include +#include +#endif + +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/fast_math.h" +#include "cutlass/functional.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/vector.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" + +#include "debug_utils.h" +#include "gemm_kernel_utils.h" + +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h" +#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/kernel/default_gemm.h" +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/default_mma_core_simt.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass/integer_subbyte.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/platform/platform.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" +#include "cutlass/transform/threadblock/vector_iterator.h" +#include "epilogue/epilogue_pipelined.h" +#include "iterators/epilogue_predicated_tile_iterator.h" + +#include "gemm/custom_mma.h" +#include "gemm/find_default_mma.h" +#include "gemm/mma_accum_lambda_iterator.h" +#include "gemm/mma_from_smem.h" +#include "transform/tile_smem_loader.h" + +using namespace gemm_kernel_utils; + +namespace { + +template +struct GmemTile { + /* + Helper functions to efficient store/load RF to gmem + + GEMM accumulators have a particular format on A100, and + it takes some compute/shared-memory to rearrange them to + a RowMajor or ColumnMajor format in global memory through + an Epilogue. The same complexity goes for loading into RF. + + This class loads/stores RF as they are, and can be used for + efficient accumulation across gemms for instance: + + ``` + GmemTile tile; + for (int i = 0; i < N; ++i) { + // ... + + Fragment accum; + if (i == 0) { + accum.clear(); + } else { + tile.load(accum); + } + mma(accum, ...); + if (i < N-1) { + // Store for next GEMM + tile.store(accum); + } else { + // Store in tensor (eg RowMajor) + epilogue(accum); + } + + // ... 
+ } + ``` + */ + + // 128bits per thread + using AccessType = cutlass::Array; + static constexpr int32_t kBytes = sizeof(AccessType); + static constexpr int32_t kStride = kNumThreads * AccessType::kElements; + static constexpr int32_t kNumIters = + FragmentType::kElements / AccessType::kElements; + static constexpr int32_t kElementsStored = + kNumThreads * FragmentType::kElements; + static_assert( + FragmentType::kElements % AccessType::kElements == 0, + "fragment not aligned on 128 bits"); + + float* ptr; + + CUTLASS_DEVICE void load(FragmentType& fragment, int thread_id) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNumIters; ++i) { + AccessType* __restrict__ gmem_ptr = reinterpret_cast( + ptr + thread_id * AccessType::kElements + i * kStride); + AccessType sub_fragment; + cutlass::arch::global_load( + sub_fragment, gmem_ptr, true); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < AccessType::kElements; ++j) { + fragment[i * AccessType::kElements + j] = sub_fragment[j]; + } + } + } + + CUTLASS_DEVICE void store(FragmentType const& fragment, int thread_id) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNumIters; ++i) { + AccessType* __restrict__ gmem_ptr = reinterpret_cast( + ptr + thread_id * AccessType::kElements + i * kStride); + AccessType sub_fragment; + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < AccessType::kElements; ++j) { + sub_fragment[j] = fragment[i * AccessType::kElements + j]; + } + cutlass::arch::global_store( + sub_fragment, gmem_ptr, true); + } + } + + CUTLASS_DEVICE void storeAtomicAdd( + FragmentType const& fragment, + int thread_id) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNumIters; ++i) { + float* gmem_ptr = ptr + thread_id * AccessType::kElements + i * kStride; + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < AccessType::kElements; ++j) { + float val = fragment[i * AccessType::kElements + j]; + float* ptr = gmem_ptr + j; + atomicAdd(ptr, val); + } + } + } +}; + +struct AtomicLock { + CUTLASS_DEVICE static void acquire( + int32_t* lock, + int set_val, + int thread_id) { + if (thread_id == 0) { + while (atomicCAS(lock, 0 /*cmp*/, set_val /*setval*/) != set_val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + __nanosleep(40); +#endif + } + } + __syncthreads(); + } + CUTLASS_DEVICE static void release(int32_t* lock, int thread_id) { + if (thread_id == 0) { + int status = 0; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile("st.global.release.gpu.b32 [%0], %1;\n" + : + : "l"(lock), "r"(status)); +#else + asm volatile("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); +#endif + } + } +}; + +template +constexpr int getWarpsPerSmBw() { + bool is_half = !cutlass::platform::is_same::value; + if (Arch::kMinComputeCapability >= 80) { + return is_half ? 
12 : 8; + } + return 8; +} +} // namespace + +template < + // which arch we target (eg `cutlass::arch::Sm80`) + typename ArchTag_, + // input/output type + typename scalar_t_, + // run optimized kernel because memory accesses will be aligned + bool kIsAligned_, + // use dropout if enabled + bool kApplyDropout_, + // when doing a GEMM, preload the next one (uses more shmem) + bool kPreload_, + // block dimensions + int kBlockSizeI_, + int kBlockSizeJ_, + // upperbound on `max(value.shape[-1], query.shape[-1])` + int kMaxK_ = (int)cutlass::platform::numeric_limits::max(), + // assumes that `cu_seqlen` is None, and + // (1) `num_queries % kBlockSizeI == 0` + // (2) `num_keys % kBlockSizeJ == 0` + bool kKeysQueriesAlignedToBlockSize_ = false, + // Allows to parallelize across keys + bool kEnableSplitKeys_ = true> +struct AttentionBackwardKernel { + enum CustomMaskType { + NoCustomMask = 0, + CausalFromTopLeft = 1, + CausalFromBottomRight = 2, + NumCustomMaskTypes, + }; + using scalar_t = scalar_t_; + using output_t = scalar_t; + using output_accum_t = float; + using lse_scalar_t = float; + using accum_t = float; + using ArchTag = ArchTag_; + static constexpr bool kIsAligned = kIsAligned_; + static constexpr bool kApplyDropout = kApplyDropout_; + static constexpr bool kPreload = kPreload_; + static constexpr int kBlockSizeI = kBlockSizeI_; + static constexpr int kBlockSizeJ = kBlockSizeJ_; + static constexpr int kMaxK = kMaxK_; + static constexpr bool kKeysQueriesAlignedToBlockSize = + kKeysQueriesAlignedToBlockSize_; + + static constexpr int64_t kWarpSize = 32; + + // If this is true, we store and accumulate dK/dV in RF + // rather than going back to gmem everytime + static constexpr bool kIsHalf = cutlass::sizeof_bits::value <= 16; + static constexpr bool kOutputInRF = kIsHalf && kMaxK <= kBlockSizeI; + static_assert( + !kPreload || + (kIsHalf && ArchTag::kMinComputeCapability >= 80 && kOutputInRF), + "preload MMA not supported"); + static constexpr bool kPrologueQK = kPreload; + static constexpr bool kPrologueGV = kPreload; + static constexpr bool kPrologueDOV = kPreload; + static constexpr bool kPrologueGQ = kPreload; + static constexpr bool kPrologueGK = kPreload; + + static constexpr int64_t kNumWarpsPerBlock = + (kBlockSizeI * kBlockSizeJ) / (32 * 32); + + // Compute delta for the f16 kernels + // TODO: Figure out why it's slower on the f32 kernels + // (something due to RF pressure?) + // TODO: Remove condition on `kOutputInRF` - this is needed to work + // around a compiler bug on V100, not exactly sure why but I spent + // too much time on this already. 
Reproducible with + // (B, Mq, Mkv, K) = (1, 1, 1, 136) for instance + static constexpr bool kKernelComputesDelta = + kIsHalf && (kOutputInRF || ArchTag::kMinComputeCapability != 70); + + // Launch bounds + static constexpr int64_t kNumThreads = kWarpSize * kNumWarpsPerBlock; + static constexpr int64_t kMinBlocksPerSm = + getWarpsPerSmBw() / kNumWarpsPerBlock; + + using GemmType = DefaultGemmType; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration< + typename GemmType::OpClass, + ArchTag, + scalar_t, + scalar_t, + scalar_t, // ElementC + accum_t // ElementAccumulator + >; + static constexpr auto kOptimalAlignement = cutlass::platform::max( + DefaultConfig::kAlignmentA, + DefaultConfig::kAlignmentB); + static constexpr auto kMinimumAlignment = GemmType::kMinimumAlignment; + + struct MatmulQK { + /* + attn_T = k_j @ q_i.transpose(-2, -1) # matmul + attn_T = (attn_T - logsumexp[i_start:i_end].unsqueeze(1).transpose(-2, + -1)).exp() # epilogue + + with attn_T.shape = (kBlockSizeJ, kBlockSizeI) + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using DefaultMma = typename cutlass::gemm::threadblock::DefaultMma< + scalar_t, // ElementA + cutlass::layout::RowMajor, // LayoutA + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment, + scalar_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + accum_t, // ElementC + cutlass::layout::RowMajor, // LayoutC + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + DefaultConfig::kStages, + typename GemmType::Operator, + false, // AccumulatorsInRowMajor = false, + cutlass::gemm::SharedMemoryClearOption::kNone>; + using MmaCore = typename DefaultMma::MmaCore; + using Mma = + typename MakeCustomMma::Mma; + + // used for efficient load of bias tile (Bij) from global memory to shared + // memory + using BiasLoader = TileSmemLoader< + scalar_t, + // Bij is applied to transposed attn matrix tile (Pij.T). Bij is loaded + // row-major but needs to have transposed shape so we get the same + // elements. + cutlass::MatrixShape, + MmaCore::kThreads, + // input restriction: kv_len has to be a multiple of this value + 128 / cutlass::sizeof_bits::value>; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm< + typename Mma::Operator::IteratorC, + typename Mma::Operator, + scalar_t, + WarpShape, + ThreadblockShape>; + using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator< + typename Mma::Operator::IteratorC, + accum_t, + kWarpSize>::Iterator; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MatmulGradV { + /* + grad_v[j_start:j_end] += attn_T @ do_i # matmul + + Dimensions: (kBlockSizeJ * kNumWarpsPerBlock, kBlockSizeI, K) + (we might need to iterate multiple times on K) + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + DefaultConfig::kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::RowMajor, // LayoutB, + kIsAligned ? 
DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + output_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + // if dropout: + // for computing dVj += (Pij.T * Zij) @ dOi + // Pij_dropped.T = Pij.T * Zij is computed on the fly as fragments of + // Pij.T are loaded in. The reason we do it this way is because Pij.T and + // Zij are reused in later steps, while Pij_dropped.T is only needed in + // this step. computing Pij_dropped.T on the fly allows us to avoid + // keeping all 3 of Pij_dropped.T, Pij.T, and Zij in shared memory at the + // same time. + // if no dropout: + // for computing dVj += Pij.T @ dOi + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Operator::Shape, // WarpShape + typename DefaultGemm::Mma::Operator:: + InstructionShape, // InstructionShape + typename DefaultGemm::Mma::Operator:: + IteratorA, // RegularWarpIterator + typename DefaultGemm::Mma::Policy // Policy + >::WarpIterator; + using DefaultMmaFromSmem = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulQK::AccumulatorSharedStorage::Shape::kN, + WarpIteratorA, + kApplyDropout>; // kScaleOperandA + + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + + struct MatmulDOIVJ { + /* + doi_t_vj = do_i @ v_j.transpose(-2, -1) # matmul + tmp = (doi_t_vj - Di.unsqueeze(1)) * attn # inplace / epilogue? + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + + using ElementC = output_t; + using ElementAccum = accum_t; + + // no-op output op - epilogue just stores result to global memory + using BiasGradEpilogueOutputOp = + typename cutlass::epilogue::thread::LinearCombination< + ElementC, + DefaultConfig::EpilogueOutputOp::kCount, + typename DefaultConfig::EpilogueOutputOp::ElementAccumulator, + typename DefaultConfig::EpilogueOutputOp::ElementCompute, + cutlass::epilogue::thread::ScaleType::Nothing>; + + using DefaultGemm = typename cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA + cutlass::layout::RowMajor, // LayoutA + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment, + scalar_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + ElementC, // ElementC + cutlass::layout::RowMajor, // LayoutC + ElementAccum, // ElementAccumulator + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + BiasGradEpilogueOutputOp, // EpilogueOutputOp + void, // ThreadblockSwizzle (not used) + // multiple preloads, dropout Zij tile, and 3 stages push us over shared + // memory capacity on A100. 
set a ceiling on number of stages to save + // shared memory if dropout is in use. + kPreload && kApplyDropout && (kBlockSizeI * kBlockSizeJ > 64 * 64) + ? cutlass::const_min(2, DefaultConfig::kStages) + : DefaultConfig::kStages, // Stages + false, // SplitKSerial + typename GemmType::Operator, + cutlass::gemm::SharedMemoryClearOption::kNone>; + using Mma = typename MakeCustomMma::Mma; + using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator< + typename Mma::Operator::IteratorC, + ElementAccum, + kWarpSize>::Iterator; + + // epilogue used to write bias gradient, which is just the output of this + // matmul with some operations applied to the fragment + using BiasGradEpilogue = typename DefaultGemm::Epilogue; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm< + typename DefaultGemm::Mma::Operator::IteratorC, + typename DefaultGemm::Mma::Operator, + scalar_t, + WarpShape, + ThreadblockShape>; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MatmulGradQ { + // grad_q <- tmp @ k_j + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + DefaultConfig::kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::RowMajor, // LayoutB, + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + output_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Operator::Shape, + typename DefaultGemm::Mma::Operator::InstructionShape, + typename DefaultGemm::Mma::Operator::IteratorA, + typename DefaultGemm::Mma::Policy>::WarpIterator; + using DefaultMmaFromSmem = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulDOIVJ::AccumulatorSharedStorage::Shape::kN, + WarpIteratorA, + false>; // kScaleOperandA + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + struct MatmulGradK { + // grad_k <- tmp.transpose(-2, -1) @ q_i + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + DefaultConfig::kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::RowMajor, // LayoutB, + kIsAligned ? 
DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + output_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Operator::Shape, + typename DefaultGemm::Mma::Operator::InstructionShape, + typename DefaultGemm::Mma::Operator::IteratorA, + typename DefaultGemm::Mma::Policy>::WarpIterator; + using DefaultMmaFromSmemN = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulQK::AccumulatorSharedStorage::Shape::kN, // kMaxK + WarpIteratorA, + false>; // kScaleOperandA + using DefaultMmaFromSmemT = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulDOIVJ::AccumulatorSharedStorage::Shape::kM, // kMaxK + WarpIteratorA, + false, // kScaleOperandA + kPreload>; // kTransposeA + using DefaultMmaFromSmem = typename cutlass::platform::conditional< + DefaultMmaFromSmemT::kIsTransposedA, + DefaultMmaFromSmemT, + DefaultMmaFromSmemN>::type; + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + + static constexpr bool kEnableSplitKeys = kEnableSplitKeys_; + + static constexpr bool kNeedsAccumGradQ = kEnableSplitKeys || + !cutlass::platform::is_same::value; + static constexpr bool kNeedsAccumGradK = !kOutputInRF && + !cutlass::platform::is_same::value; + static constexpr bool kNeedsAccumGradV = !kOutputInRF && + !cutlass::platform::is_same::value; + + struct GradQTempStorage { + int32_t lock; + int32_t counter; + int32_t pad[2]; // pad to 128bits + output_accum_t buffer[MatmulGradQ::AccumTileGmem::kElementsStored]; + }; + + struct Params { + // Input tensors + scalar_t* query_ptr = nullptr; // [Mq, nH, K] + scalar_t* key_ptr = nullptr; // [Mk, nH, K] + scalar_t* value_ptr = nullptr; // [Mk, nH, Kv] + scalar_t* bias_ptr = nullptr; + lse_scalar_t* logsumexp_ptr = nullptr; // [nH, Mq] + scalar_t* output_ptr = nullptr; // [Mq, nH, Kv] + scalar_t* grad_output_ptr = nullptr; // [Mq, nH, Kv] + accum_t* delta_ptr = nullptr; // [nH, Mq] + int32_t* cu_seqlens_q_ptr = nullptr; + int32_t* cu_seqlens_k_ptr = nullptr; + + // Output tensors + output_t* grad_query_ptr = nullptr; // [Mq, nH, K] + output_t* grad_key_ptr = nullptr; // [Mk, nH, K] + output_t* grad_value_ptr = nullptr; // [Mk, nH, Kv] + output_t* grad_bias_ptr = nullptr; + + // Accumulators + output_accum_t* workspace = nullptr; // [Mq, Kq] + [Mkv, Kq] + [Mkv, Kv] + output_accum_t* workspace_gv = + nullptr; // (will be calculated by the kernel) + GradQTempStorage* workspace_gq = + nullptr; // (will be calculated by the kernel) + + // Scale + accum_t scale = 1.0f; + + // Dimensions/strides + int32_t head_dim = -1; + int32_t head_dim_value = -1; + int32_t num_queries = -1; + int32_t num_keys = -1; + int32_t num_heads = -1; + uint8_t 
custom_mask_type = NoCustomMask; + + int32_t q_strideM = -1; + int32_t k_strideM = -1; + int32_t v_strideM = -1; + int32_t bias_strideM = 0; + int32_t gO_strideM = -1; + int32_t gB_strideM = -1; + int8_t gQKV_strideM_multiplier = 1; // 3 for packed, 1 otherwise + +#ifdef HAS_PYTORCH + // dropout + at::PhiloxCudaState rng_engine_inputs = {0, 0}; +#endif + // RNG sequence offset based on batch_id and head_id + unsigned long long dropout_batch_head_rng_offset = 0; + float dropout_prob = 0.0f; + + CUTLASS_HOST_DEVICE int32_t o_strideM() const { + return head_dim_value * num_heads; + } + CUTLASS_HOST_DEVICE int32_t gQ_strideM() const { + return gQKV_strideM_multiplier * num_heads * head_dim; + } + CUTLASS_HOST_DEVICE int32_t gK_strideM() const { + return gQKV_strideM_multiplier * num_heads * head_dim; + } + CUTLASS_HOST_DEVICE int32_t gV_strideM() const { + return gQKV_strideM_multiplier * num_heads * head_dim_value; + } + + // Everything below is only used in `advance_to_block` + // and shouldn't use registers + int64_t o_strideH = -1; + int32_t q_strideH = -1; + int32_t k_strideH = -1; + int32_t v_strideH = -1; + int64_t bias_strideH = 0; + int64_t o_strideB = -1; + int64_t q_strideB = -1; + int64_t k_strideB = -1; + int64_t v_strideB = -1; + int64_t bias_strideB = 0; + int64_t lse_strideB = -1; + int64_t lse_strideH = -1; + int64_t delta_strideB = -1; + int64_t delta_strideH = -1; + int32_t num_batches = -1; + int16_t num_splits_key = 1; // We use `gridDim.x` inside kernel + + int64_t gO_strideB = 0; + int64_t gQ_strideB = 0; + int64_t gK_strideB = 0; + int64_t gV_strideB = 0; + int64_t gB_strideB = 0; + int64_t gO_strideH = 0; + int64_t gQ_strideH = 0; + int64_t gK_strideH = 0; + int64_t gV_strideH = 0; + int64_t gB_strideH = 0; + + CUTLASS_DEVICE int16_t num_splits_key_device() const { + return kEnableSplitKeys ? gridDim.x : 1; + } + CUTLASS_DEVICE int16_t split_key_device() const { + return kEnableSplitKeys ? 
blockIdx.x : 0; + } + + CUTLASS_DEVICE bool advance_to_block() { + int64_t batch_id = blockIdx.z; + int32_t head_id = blockIdx.y; + + if (kNeedsAccumGradQ || kNeedsAccumGradK || kNeedsAccumGradV) { + assert(workspace_size() == 0 || workspace != nullptr); + + workspace += (batch_id * num_heads + head_id) * workspace_strideBH(); + workspace = warp_uniform(workspace); + workspace_gv = workspace + workspace_elements_gk(); + workspace_gq = + (GradQTempStorage*)(workspace_gv + workspace_elements_gv()); + if (kEnableSplitKeys) { + workspace_gv += workspace_elements_gv() * split_key_device() / + num_splits_key_device(); + workspace += workspace_elements_gk() * split_key_device() / + num_splits_key_device(); + } + } else { + workspace = nullptr; + } + + // Advance pointers that depend on the total concatenated + // number of queries, as `num_queries` is modified in the block + // below + dropout_batch_head_rng_offset = + batch_id * (num_heads * num_queries * num_keys) + + head_id * (num_queries * num_keys); + logsumexp_ptr += batch_id * lse_strideB + head_id * lse_strideH; + + if (cu_seqlens_q_ptr != nullptr) { + assert(cu_seqlens_k_ptr != nullptr); + cu_seqlens_q_ptr += batch_id; + cu_seqlens_k_ptr += batch_id; + int32_t q_start = cu_seqlens_q_ptr[0]; + int32_t k_start = cu_seqlens_k_ptr[0]; + int64_t q_next_start = cu_seqlens_q_ptr[1]; + int64_t k_next_start = cu_seqlens_k_ptr[1]; + assert(q_next_start - q_start <= num_queries); + assert(k_next_start - k_start <= num_keys); + num_queries = q_next_start - q_start; + num_keys = k_next_start - k_start; + + // Jump manually + batch_id = 0; + + query_ptr += q_start * q_strideM; + key_ptr += k_start * k_strideM; + value_ptr += k_start * v_strideM; + assert(bias_ptr == nullptr); + assert(grad_bias_ptr == nullptr); + output_ptr += q_start * o_strideM(); + grad_output_ptr += q_start * gO_strideM; + delta_ptr += q_start; + + grad_query_ptr += q_start * gQ_strideM(); + grad_key_ptr += k_start * gK_strideM(); + grad_value_ptr += k_start * gV_strideM(); + } + + query_ptr += batch_id * q_strideB + head_id * q_strideH; + key_ptr += batch_id * k_strideB + head_id * k_strideH; + value_ptr += batch_id * v_strideB + head_id * v_strideH; + if (bias_ptr != nullptr) { + bias_ptr += batch_id * bias_strideB + head_id * bias_strideH; + } + output_ptr += batch_id * o_strideB + head_id * o_strideH; + grad_output_ptr += batch_id * gO_strideB + head_id * gO_strideH; + delta_ptr += batch_id * delta_strideB + head_id * delta_strideH; + + grad_query_ptr += batch_id * gQ_strideB + head_id * gQ_strideH; + grad_key_ptr += batch_id * gK_strideB + head_id * gK_strideH; + grad_value_ptr += batch_id * gV_strideB + head_id * gV_strideH; + if (grad_bias_ptr != nullptr) { + grad_bias_ptr += batch_id * gB_strideB + head_id * gB_strideH; + } + + // Some values are modified above + // Signal to the compiler that they are the same in all threads + // and can be stored in warp-uniform registers (Sm75+) + num_queries = warp_uniform(num_queries); + num_keys = warp_uniform(num_keys); + custom_mask_type = warp_uniform(custom_mask_type); + + query_ptr = warp_uniform(query_ptr); + key_ptr = warp_uniform(key_ptr); + value_ptr = warp_uniform(value_ptr); + bias_ptr = warp_uniform(bias_ptr); + logsumexp_ptr = warp_uniform(logsumexp_ptr); + output_ptr = warp_uniform(output_ptr); + grad_output_ptr = warp_uniform(grad_output_ptr); + delta_ptr = warp_uniform(delta_ptr); + + grad_query_ptr = warp_uniform(grad_query_ptr); + grad_key_ptr = warp_uniform(grad_key_ptr); + grad_value_ptr = 
warp_uniform(grad_value_ptr); + grad_bias_ptr = warp_uniform(grad_bias_ptr); + +#if 0 + PRINT_T0("[b:%d h:%d] dp[0]:%f Q:%f K:%f V:%f LSE:%f", + int(blockIdx.z), int(blockIdx.y), + float(delta_ptr[0]), + float(query_ptr[0]), float(key_ptr[0]), float(value_ptr[0]), + float(logsumexp_ptr[0]) + ) +#endif + return true; + } + + __host__ dim3 getBlocksGrid() const { + return dim3(num_splits_key, num_heads, num_batches); + } + __host__ dim3 getThreadsGrid() const { + return dim3(kWarpSize * kNumWarpsPerBlock, 1, 1); + } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gk() const { + if (!kNeedsAccumGradK) { + return 0; + } + return num_splits_key * align_up(num_keys, (int32_t)kBlockSizeJ) * + align_up(head_dim, (int32_t)kBlockSizeI); + } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gv() const { + if (!kNeedsAccumGradV) { + return 0; + } + return num_splits_key * align_up(num_keys, (int32_t)kBlockSizeJ) * + align_up(head_dim_value, (int32_t)kBlockSizeI); + } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gq() const { + if (!kNeedsAccumGradQ) { + return 0; + } + int num_blocks = ceil_div(num_queries, kBlockSizeI); + int num_cols = ceil_div(head_dim, MatmulGradQ::ThreadblockShape::kN); + return num_blocks * num_cols * sizeof(GradQTempStorage) / + sizeof(output_accum_t); + } + CUTLASS_HOST_DEVICE int64_t workspace_strideBH() const { + // Aligned on 128bits + return align_up( + workspace_elements_gk() + workspace_elements_gv() + + workspace_elements_gq(), + int64_t(4)); + } + CUTLASS_HOST_DEVICE int64_t workspace_size() const { + // Returns size of buffer we need to run this kernel + return num_batches * num_heads * workspace_strideBH() * sizeof(float); + } + CUTLASS_HOST_DEVICE bool should_zero_workspace() const { + return num_splits_key > 1; + } + }; + + // shared storage for keeping Zij matrix. not needed if we aren't using + // dropout, in which case we use an empty array to save shared memory + using ZijSharedStorage = typename cutlass::platform::conditional< + kApplyDropout, + typename MatmulQK::AccumulatorSharedStorage, + // dummy shared storage object that takes up no space. + typename cutlass::gemm::threadblock::AccumulatorSharedStorage< +#ifdef _WIN32 + // windows builds throw the error: + // "type containing an unknown-size array is not allowed" + // if we try to make Zij shared storage zero-sized. + // To get around this just make it sized 1 on windows. + typename cutlass::gemm::GemmShape<1, 1, 0>, +#else + typename cutlass::gemm::GemmShape<0, 0, 0>, +#endif + typename MatmulQK::AccumulatorSharedStorage::Element, + typename MatmulQK::AccumulatorSharedStorage::Layout, + typename cutlass::MatrixShape<0, 0>>>::type; + + struct SharedStoragePrologue { + struct { + cutlass::Array di; // (do_i * o_i).sum(-1) + typename MatmulQK::Mma::SharedStorageA mm_qk_k; + } persistent; + union { + struct { + // part1 - after Q.K / dV / dO.V + union { + // 1. efficient load of bias tile Bij, which is then applied to Pij + typename MatmulQK::BiasLoader::SmemTile bias; + // 4. store Pij. it is needed: + // - in dVj += (Pij.T * Zij) @ dOi + // - in dSij = Pij * (dPij - Di) + // 6. dVj += (Pij.T * Zij) @ dOi + // 10. write to fragment + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + }; + // 5. store Zij. it is needed in dVj += (Pij.T * Zij) @ dOi + ZijSharedStorage zij; + + union { + // 2. prologue for dVj + // 6. workspace for dVj += (Pij.T * Zij) @ dOi + typename MatmulGradV::Mma::SharedStorage mm_gradV; + // 7. 
dVj epilogue + typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue; + }; + + // 3. prologue for dPij_dropped + // 8. used in dPij_dropped = dOi @ Vj.T + typename MatmulDOIVJ::Mma::SharedStorage mm_doivj; + } part1; + + struct { + // part2 - dQ + union { + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part1) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + }; + typename MatmulGradK::Mma::SharedStorage mm_gradK; // (preload) + typename MatmulGradQ::Mma::SharedStorage mm_gradQ; // (preload) + union { + // store dB = dSij to global memory + typename MatmulDOIVJ::BiasGradEpilogue::SharedStorage gradB_epilogue; + typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue; + }; + + } part2; + + struct { + // part3 - after last iteration on dQ's epilogue / dK + union { + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part1) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + }; + typename MatmulGradK::Mma::SharedStorage mm_gradK; // (preload) + typename MatmulGradQ::DefaultEpilogue::SharedStorage + gradQ_epilogue_lastIter; + + typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue; + } part3; + + struct { + // part4 - after last iteration on dK's epilogue / preload next K.Q_t + typename MatmulQK::Mma::SharedStorageB mm_qk_q; + + // If we reach end of current key, dump RF->gmem with "final" epilogues + typename MatmulGradK::DefaultEpilogue::SharedStorage + gradK_epilogue_final; + typename MatmulGradV::DefaultEpilogue::SharedStorage + gradV_epilogue_final; + } part4; + }; + static void print_size() { + // Field size +#define FSZ(f) int((sizeof(((SharedStoragePrologue*)0)->f))) + + printf("Total smem: %d bytes\n", int(sizeof(SharedStoragePrologue))); + printf(" persistent: %db\n", FSZ(persistent)); + printf(" mm_qk_k: %db\n", FSZ(persistent.mm_qk_k)); + printf(" part1: %db\n", FSZ(part1)); + printf(" bias: %db\n", FSZ(part1.bias)); + printf(" attn_shared_storage: %db\n", FSZ(part1.attn_shared_storage)); + printf(" zij: %db\n", FSZ(part1.zij)); + printf(" mm_gradV: %db\n", FSZ(part1.mm_gradV)); + printf(" gradV_epilogue: %db\n", FSZ(part1.gradV_epilogue)); + printf(" mm_doivj: %db\n", FSZ(part1.mm_doivj)); + printf(" part2: %db\n", FSZ(part2)); + printf(" tmpT_shared_storage: %db\n", FSZ(part2.tmpT_shared_storage)); + printf(" tmp_shared_storage: %db\n", FSZ(part2.tmp_shared_storage)); + printf(" mm_gradK: %db\n", FSZ(part2.mm_gradK)); + printf(" mm_gradQ: %db\n", FSZ(part2.mm_gradQ)); + printf(" gradB_epilogue: %db\n", FSZ(part2.gradB_epilogue)); + printf(" gradQ_epilogue: %db\n", FSZ(part2.gradQ_epilogue)); + printf(" part3: %db\n", FSZ(part3)); + printf(" tmpT_shared_storage: %db\n", FSZ(part3.tmpT_shared_storage)); + printf(" part4: %db\n", FSZ(part4)); + printf(" mm_qk_q: %db\n", FSZ(part4.mm_qk_q)); + printf( + " gradK_epilogue_final: %db\n", FSZ(part4.gradK_epilogue_final)); + printf( + " gradV_epilogue_final: %db\n", FSZ(part4.gradV_epilogue_final)); + } +// =========================================== +#define FIELD(INSIDE_STRUCT, FIELDNAME) \ + CUTLASS_DEVICE auto& FIELDNAME() { \ + return INSIDE_STRUCT.FIELDNAME; \ + } + + FIELD(persistent, di) + FIELD(persistent, mm_qk_k) + FIELD(part1, bias) + FIELD(part1, attn_shared_storage) + FIELD(part1, zij) + FIELD(part1, mm_gradV) + FIELD(part1, gradV_epilogue) + FIELD(part1, mm_doivj) + FIELD(part2, mm_gradK) + FIELD(part2, mm_gradQ) + FIELD(part2, gradB_epilogue) + FIELD(part2, gradQ_epilogue) + FIELD(part2, 
tmp_shared_storage) + FIELD(part3, tmpT_shared_storage) + FIELD(part3, gradQ_epilogue_lastIter) + FIELD(part3, gradK_epilogue) + FIELD(part4, mm_qk_q) + FIELD(part4, gradK_epilogue_final) + FIELD(part4, gradV_epilogue_final) + }; + + struct SharedStorageNoPrologue { + struct { + cutlass::Array di; // (do_i * o_i).sum(-1) + } persistent; + union { + struct { + // part1 - Q.K matmul + typename MatmulQK::Mma::SharedStorageA mm_qk_k; + typename MatmulQK::Mma::SharedStorageB mm_qk_q; + } part1; + + struct { + // part2 - compute gradV + union { + // 1. efficient load of bias tile Bij, which is then applied to Pij + typename MatmulQK::BiasLoader::SmemTile bias; + // 2. store Pij to shared memory. it is needed: + // - in this step, where it is used in dVj += (Pij.T * Zij) @ dOi + // - in next step where it is used in dSij = Pij * (dPij - Di) + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + }; + // 3. store Zij. it is needed in this step, where it is used + // to compute Pij_dropped = Pij * Zij on the fly as fragments of Pij are + // loaded for the computation of dVj. + ZijSharedStorage zij; + + union { + typename MatmulGradV::Mma::SharedStorage mm_gradV; + typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue; + }; + } part2; + + struct { + // part3 - DO.V matmul + union { + // first compute dPij = (dOi @ Vj.T) * Zij + // and dSij = Pij * (dPij - Di) + struct { + // (from part2) - Pij for computing dSij = Pij * (dPij - Di) + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + // matmul to compute dOiVj + typename MatmulDOIVJ::Mma::SharedStorage mm_doivj; + }; + // then store dB = dSij to global memory + typename MatmulDOIVJ::BiasGradEpilogue::SharedStorage gradB_epilogue; + }; + } part3; + + struct { + // part4 - compute gradQ + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part2) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + union { + typename MatmulGradQ::Mma::SharedStorage mm_gradQ; + typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue; + typename MatmulGradQ::DefaultEpilogue::SharedStorage + gradQ_epilogue_lastIter; + }; + } part4; + + struct { + // part5 - compute gradK + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part2) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + union { + typename MatmulGradK::Mma::SharedStorage mm_gradK; + typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue; + }; + } part5; + + struct { + // part6 - store RF accumulated into gmem + typename MatmulGradK::DefaultEpilogue::SharedStorage + gradK_epilogue_final; + typename MatmulGradV::DefaultEpilogue::SharedStorage + gradV_epilogue_final; + } part6; + }; + static void print_size() { +#define FIELD_SIZEOF(f) int((sizeof(((SharedStorageNoPrologue*)0)->f))) + printf("Total smem: %d bytes\n", int(sizeof(SharedStorageNoPrologue))); + printf(" persistent: %db\n", FIELD_SIZEOF(persistent)); + printf(" part1: %db\n", FIELD_SIZEOF(part1)); + printf(" part2: %db\n", FIELD_SIZEOF(part2)); + printf(" part3: %db\n", FIELD_SIZEOF(part3)); + printf(" part4: %db\n", FIELD_SIZEOF(part4)); + printf(" part5: %db\n", FIELD_SIZEOF(part5)); + printf(" part6: %db\n", FIELD_SIZEOF(part6)); + } +// =========================================== +#define FIELD(INSIDE_STRUCT, FIELDNAME) \ + CUTLASS_DEVICE auto& FIELDNAME() { \ + return INSIDE_STRUCT.FIELDNAME; \ + } + + FIELD(persistent, di) + FIELD(part1, mm_qk_k) + FIELD(part1, mm_qk_q) + FIELD(part2, bias) + 
FIELD(part2, attn_shared_storage) + FIELD(part2, zij) + FIELD(part2, mm_gradV) + FIELD(part2, gradV_epilogue) + FIELD(part3, mm_doivj) + FIELD(part3, gradB_epilogue) + FIELD(part4, tmpT_shared_storage) + FIELD(part4, tmp_shared_storage) + FIELD(part4, mm_gradQ) + FIELD(part4, gradQ_epilogue) + FIELD(part4, gradQ_epilogue_lastIter) + FIELD(part5, mm_gradK) + FIELD(part5, gradK_epilogue) + FIELD(part6, gradK_epilogue_final) + FIELD(part6, gradV_epilogue_final) + }; + + using SharedStorage = typename cutlass::platform::conditional< + kPreload, + SharedStoragePrologue, + SharedStorageNoPrologue>::type; + + struct OutputFragments { + typename MatmulGradV::Mma::FragmentC gradV; + typename MatmulGradK::Mma::FragmentC gradK; + + CUTLASS_DEVICE void clear() { + gradV.clear(); + gradK.clear(); + } + }; + + static bool __host__ check_supported(Params const& p) { + CHECK_ALIGNED_PTR(p.query_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.key_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.value_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.output_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.grad_output_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.bias_ptr, kMinimumAlignment); + XFORMERS_CHECK(p.lse_strideH % 8 == 0, "LSE is not correctly aligned"); + XFORMERS_CHECK(p.lse_strideB % 8 == 0, "LSE is not correctly aligned"); + XFORMERS_CHECK( + p.num_heads <= 1 || p.q_strideH % kMinimumAlignment == 0, + "query is not correctly aligned (strideH)"); + XFORMERS_CHECK( + p.num_heads <= 1 || p.k_strideH % kMinimumAlignment == 0, + "key is not correctly aligned (strideH)"); + XFORMERS_CHECK( + p.num_heads <= 1 || p.v_strideH % kMinimumAlignment == 0, + "value is not correctly aligned (strideH)"); + XFORMERS_CHECK( + p.num_batches <= 1 || p.q_strideB % kMinimumAlignment == 0, + "query is not correctly aligned (strideB)"); + XFORMERS_CHECK( + p.num_batches <= 1 || p.k_strideB % kMinimumAlignment == 0, + "key is not correctly aligned (strideB)"); + XFORMERS_CHECK( + p.num_batches <= 1 || p.v_strideB % kMinimumAlignment == 0, + "value is not correctly aligned (strideB)"); + XFORMERS_CHECK( + p.q_strideM % kMinimumAlignment == 0, + "query is not correctly aligned (strideM)"); + XFORMERS_CHECK( + p.k_strideM % kMinimumAlignment == 0, + "key is not correctly aligned (strideM)"); + XFORMERS_CHECK( + p.v_strideM % kMinimumAlignment == 0, + "value is not correctly aligned (strideM)"); + if (p.bias_ptr) { + XFORMERS_CHECK( + p.num_batches <= 1 || p.bias_strideB % kMinimumAlignment == 0, + "attn_bias is not correctly aligned (strideB)"); + XFORMERS_CHECK( + p.num_heads <= 1 || p.bias_strideH % kMinimumAlignment == 0, + "attn_bias is not correctly aligned (strideH)"); + XFORMERS_CHECK( + p.bias_strideM % kMinimumAlignment == 0, + "attn_bias is not correctly aligned (strideM)"); + } + if (p.grad_bias_ptr) { + XFORMERS_CHECK( + p.num_batches <= 1 || p.gB_strideB % kMinimumAlignment == 0, + "attn_bias.grad is not correctly aligned (strideB)"); + XFORMERS_CHECK( + p.num_heads <= 1 || p.gB_strideH % kMinimumAlignment == 0, + "attn_bias.grad is not correctly aligned (strideH)"); + XFORMERS_CHECK( + p.gB_strideM % kMinimumAlignment == 0, + "attn_bias.grad is not correctly aligned (strideM)"); + } + XFORMERS_CHECK( + !(p.cu_seqlens_q_ptr && p.bias_ptr), + "CuSeqlen + bias not implemented yet"); + XFORMERS_CHECK( + p.custom_mask_type < NumCustomMaskTypes, + "Invalid value for `custom_mask_type`"); + XFORMERS_CHECK( + p.dropout_prob <= 1.0f && p.dropout_prob >= 0.0f, + "Invalid value for `dropout_prob`"); + XFORMERS_CHECK( + 
kApplyDropout || p.dropout_prob == 0.0f, + "Set `kApplyDropout`=True to support `dropout_prob > 0`"); + XFORMERS_CHECK(p.head_dim > 0, "Invalid value for `head_dim`"); + XFORMERS_CHECK(p.head_dim_value > 0, "Invalid value for `head_dim_value`"); + XFORMERS_CHECK(p.num_queries > 0, "Invalid value for `num_queries`"); + XFORMERS_CHECK(p.num_keys > 0, "Invalid value for `num_keys`"); + XFORMERS_CHECK(p.num_heads > 0, "Invalid value for `num_heads`"); + XFORMERS_CHECK(p.num_batches > 0, "Invalid value for `num_batches`"); + XFORMERS_CHECK(p.head_dim <= kMaxK, "kMaxK: Expected `head_dim < kMaxK`"); + XFORMERS_CHECK( + p.head_dim_value <= kMaxK, "kMaxK: Expected `head_dim_value < kMaxK`"); + if (kKeysQueriesAlignedToBlockSize) { + XFORMERS_CHECK( + p.cu_seqlens_k_ptr == nullptr, + "This kernel does not support cu_seqlen"); + XFORMERS_CHECK( + p.cu_seqlens_q_ptr == nullptr, + "This kernel does not support cu_seqlen"); + XFORMERS_CHECK( + p.num_queries % kBlockSizeI == 0, + "kKeysQueriesAlignedToBlockSize condition not respected"); + XFORMERS_CHECK( + p.num_keys % kBlockSizeJ == 0, + "kKeysQueriesAlignedToBlockSize condition not respected"); + } + XFORMERS_CHECK( + kEnableSplitKeys || p.num_splits_key == 1, "SplitKeys is disabled"); + XFORMERS_CHECK( + p.num_splits_key > 0, "Invalid `num_splits_key` (expected >0)"); + XFORMERS_CHECK( + p.num_splits_key <= cutlass::ceil_div(p.num_keys, kBlockSizeJ), + "Invalid `num_splits_key` (too large)"); + return true; + } + + static CUTLASS_DEVICE void attention_kernel(Params p) { + extern __shared__ char smem_buffer[]; + SharedStorage& shared_storage = *((SharedStorage*)smem_buffer); + + uint16_t thread_id = threadIdx.x; + uint8_t warp_id = warp_uniform(thread_id / 32); + uint8_t lane_id = thread_id % 32; + + int32_t key_start = p.split_key_device() * kBlockSizeJ; + if (key_start >= p.num_keys) { + return; + } + if (kPrologueQK) { + int32_t query_start = getQueryStart(p, key_start); + prologueQkNextIteration( + shared_storage, p, query_start, key_start, warp_id, lane_id); + } + + // Computes (dO*out).sum(-1) and writes it to `p.delta_ptr` + if (kKernelComputesDelta) { + constexpr int kOptimalElements = + 128 / cutlass::sizeof_bits::value; + if (p.head_dim_value % kOptimalElements == 0) { + for (int query_start = 0; query_start < p.num_queries; + query_start += kBlockSizeI) { + computeDelta(p, query_start, warp_id, lane_id); + } + } else { + for (int query_start = 0; query_start < p.num_queries; + query_start += kBlockSizeI) { + computeDelta<1>(p, query_start, warp_id, lane_id); + } + } + __syncthreads(); + } + + OutputFragments output_frags; + + curandStatePhilox4_32_10_t rng_state_init; +#ifdef HAS_PYTORCH + if (kApplyDropout) { + auto seeds = at::cuda::philox::unpack(p.rng_engine_inputs); + // each element of the attention matrix P with shape + // (batch_sz, n_heads, n_queries, n_keys) is associated with a single + // offset in RNG sequence. we initialize the RNG state with offset that + // starts at the beginning of a (n_queries, n_keys) matrix for this + // block's batch_id and head_id + // initializing rng state is very expensive, so we run once per kernel, + // rather than once per iteration. each iteration takes a copy of the + // initialized RNG state and offsets it as needed. 
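+      // For concreteness: dropout_batch_head_rng_offset (set in
+      // advance_to_block above) equals
+      //   batch_id * (num_heads * num_queries * num_keys)
+      //     + head_id * (num_queries * num_keys),
+      // and the per-tile skipahead() further down adds
+      //   (query_start + thread_i) * num_keys + (key_start + thread_start_j),
+      // so an element (b, h, q, k) of the attention matrix always maps to the
+      // same position in the Philox sequence, however the keys are split
+      // across blocks.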
+ curand_init( + std::get<0>(seeds), + 0, + std::get<1>(seeds) + p.dropout_batch_head_rng_offset, + &rng_state_init); + } +#endif + CUTLASS_PRAGMA_UNROLL + for (; key_start < p.num_keys; + key_start += p.num_splits_key_device() * kBlockSizeJ) { + output_frags.clear(); + + CUTLASS_PRAGMA_UNROLL + for (int32_t query_start_shifted = getQueryStart(p, key_start); + query_start_shifted < getQueryStartShift(p) + getQueryEnd(p); + query_start_shifted += kBlockSizeI) { + // This line here + // vvvvvvvvvvvvvv + warp_id = warp_uniform(warp_id); + // ^^^^^^^^^^^^^^ + // ... makes everything use less RF and be 10% faster. Why? + // I don't know. My theory is that it forces `nvcc` to + // re-compute indices, offsets etc... and not keep them + // from the previous iteration, which prevents MASSIVE + // register spilling. + + int32_t query_start = query_start_shifted; + if (query_start >= p.num_queries) { + query_start = query_start % getQueryEnd(p); + } + + processBlockIJ( + shared_storage, + output_frags, + p, + query_start, + key_start, + rng_state_init, + warp_id, + lane_id); + } + if (kOutputInRF) { + writeFragsToGmem( + shared_storage, output_frags, p, key_start, warp_id, lane_id); + } else if (getQueryStart(p, key_start) >= p.num_queries) { + zfillGradKV( + p, key_start, warp_id, lane_id); + } + __syncthreads(); + } + } + + template + static CUTLASS_DEVICE void zfillGradKV( + Params const& p, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) { + constexpr int kThreadsPerKey = 8; + constexpr int kParallelKeys = kNumThreads / kThreadsPerKey; + static_assert(kBlockSizeJ % kParallelKeys == 0, ""); + // This function is not really optimized, but should rarely be used + // It's only used when some keys are "useless" and don't attend to + // any query, due to causal masking + + int thread_id = 32 * warp_id + lane_id; + int k_shift = lane_id % kThreadsPerKey; + + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kBlockSizeJ; j += kParallelKeys) { + int key = key_start + j + (thread_id / kThreadsPerKey); + if (!skipBoundsChecks && key >= p.num_keys) { + continue; + } + auto gv_ptr = p.grad_value_ptr + key * p.gV_strideM(); + auto gk_ptr = p.grad_key_ptr + key * p.gK_strideM(); + + for (int k = k_shift; k < p.head_dim_value; k += kThreadsPerKey) { + gv_ptr[k] = scalar_t(0); + } + for (int k = k_shift; k < p.head_dim; k += kThreadsPerKey) { + gk_ptr[k] = scalar_t(0); + } + } + } + + template + static CUTLASS_DEVICE void processBlockIJ( + SharedStorage& shared_storage, + OutputFragments& output_frags, + Params& p, + int32_t query_start, + int32_t key_start, + const curandStatePhilox4_32_10_t& curand_state_init, + uint8_t warp_id, + uint8_t lane_id) { + cutlass::Array + dropout_keep_mask_doivj; + dropout_keep_mask_doivj.fill(cutlass::uint1b_t{1}); + const float dropout_scale = + kApplyDropout ? 1.0 / (1.0 - p.dropout_prob) : 1.0f; + + cutlass::MatrixCoord no_offset{0, 0}; + accum_t scale = p.scale; + int16_t thread_id = 32 * warp_id + lane_id; + + auto rematerializeThreadIds = [&]() { + // Prevents `nvcc` from keeping values deduced from + // `thread_id`, `warp_id`, ... 
in RF - to reduce register pressure + warp_id = warp_uniform(thread_id / 32); + lane_id = thread_id % 32; + thread_id = 32 * warp_id + lane_id; + }; + + bool isFirstQuery = (query_start == getQueryStart(p, key_start)); + int32_t next_query, next_key; + incrIteration(p, query_start, key_start, next_query, next_key); + bool isLastQuery = next_key != key_start; + + accum_t di_rf = accum_t(0); + if (thread_id < kBlockSizeI) { + if (query_start + thread_id < p.num_queries) { + di_rf = p.delta_ptr[query_start + thread_id]; + } + shared_storage.di()[thread_id] = di_rf; + } + + int32_t num_queries_in_block = skipBoundsChecks + ? MatmulQK::Mma::Shape::kN + : warp_uniform(cutlass::fast_min( + (int32_t)MatmulQK::Mma::Shape::kN, p.num_queries - query_start)); + int32_t num_keys_in_block = skipBoundsChecks + ? MatmulQK::Mma::Shape::kM + : warp_uniform(cutlass::fast_min( + (int32_t)MatmulQK::Mma::Shape::kM, p.num_keys - key_start)); + + auto prologueGradV = [&](int col) { + typename MatmulGradV::Mma::IteratorB iterator_dO( + {int32_t(p.gO_strideM)}, + p.grad_output_ptr + query_start * p.gO_strideM + col, + {num_queries_in_block, p.head_dim_value - col}, + thread_id, + no_offset); + MatmulGradV::Mma::prologue( + shared_storage.mm_gradV(), + iterator_dO, + thread_id, + num_queries_in_block); + }; + auto prologueGradQ = [&](int col) { + typename MatmulGradQ::Mma::IteratorB iterator_K( + {int32_t(p.k_strideM)}, + p.key_ptr + key_start * p.k_strideM + col, + {num_keys_in_block, p.head_dim - col}, + thread_id, + no_offset); + MatmulGradQ::Mma::prologue( + shared_storage.mm_gradQ(), iterator_K, thread_id, num_keys_in_block); + }; + auto prologueGradK = [&](int col) { + typename MatmulGradK::Mma::IteratorB iterator_Q( + {int32_t(p.q_strideM)}, + p.query_ptr + query_start * p.q_strideM + col, + {num_queries_in_block, p.head_dim - col}, + thread_id, + no_offset); + MatmulGradK::Mma::prologue( + shared_storage.mm_gradK(), + iterator_Q, + thread_id, + num_queries_in_block); + }; + auto prologueDOV = [&]() { + typename MatmulDOIVJ::Mma::IteratorA iterator_A( + {int32_t(p.gO_strideM)}, + p.grad_output_ptr + query_start * p.gO_strideM, + {num_queries_in_block, p.head_dim_value}, + thread_id, + no_offset); + typename MatmulDOIVJ::Mma::IteratorB iterator_B( + {int32_t(p.v_strideM)}, + p.value_ptr + key_start * p.v_strideM, + {p.head_dim_value, num_keys_in_block}, + thread_id, + no_offset); + MatmulDOIVJ::Mma::prologue( + shared_storage.mm_doivj(), + iterator_A, + iterator_B, + thread_id, + p.head_dim_value); + }; + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // MatmulQK + ///////////////////////////////////////////////////////////////////////////////////////////////// + { + using Mma = typename MatmulQK::Mma; + + cutlass::gemm::GemmCoord problem_size( + num_keys_in_block, + num_queries_in_block, + p.head_dim // k + ); + + // k_j + typename Mma::IteratorA iterator_A( + {int32_t(p.k_strideM)}, + p.key_ptr + key_start * p.k_strideM, + {problem_size.m(), problem_size.k()}, + thread_id, + no_offset); + + // q_i.transpose(-2, -1) + typename Mma::IteratorB iterator_B( + {int32_t(p.q_strideM)}, + p.query_ptr + query_start * p.q_strideM, + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + Mma mma( + shared_storage.mm_qk_k(), + shared_storage.mm_qk_q(), + thread_id, + warp_id, + lane_id); + + typename Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute 
threadblock-scoped matrix multiply-add + mma.set_prologue_done(kPrologueQK); + mma.set_zero_outside_bounds(!skipBoundsChecks); + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + accum = cutlass::multiplies()(scale, accum); + + // Epilogue: add LSE + exp and store that to our shared memory buffer + // shmem <- (matmul_result - + // logsumexp[i_start:i_end].unsqueeze(1)).exp() + int warp_idx_mn_0 = + warp_id % (Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % Mma::Base::WarpCount::kM, + warp_idx_mn_0 / Mma::Base::WarpCount::kM}; + + // apply bias if applicable + if (p.bias_ptr != nullptr) { + // load bias tile Bij into shared memory + typename MatmulQK::BiasLoader::GmemTileIterator bias_iter( + {cutlass::layout::RowMajor(p.bias_strideM)}, + p.bias_ptr + query_start * p.bias_strideM + key_start, + {num_queries_in_block, num_keys_in_block}, + thread_id); + cutlass::TensorRef bias_tensor_ref( + shared_storage.bias().data(), + cutlass::layout::RowMajor(MatmulQK::ThreadblockShape::kM)); + typename MatmulQK::BiasLoader::SmemTileIterator smem_tile_iter( + bias_tensor_ref, thread_id); + MatmulQK::BiasLoader::load(bias_iter, smem_tile_iter); + + // Pij += Bij, where Pij is in register fragment and Bij is in shmem + auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + MatmulQK::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_n) {}, + [&](int accum_m, int accum_n, int idx) { + // remember we are transposed + accum[idx] += bias_tensor_ref.at({accum_n, accum_m}); + }, + [&](int accum_n) {}); + } + + // Apply mask + if (p.custom_mask_type == CausalFromTopLeft || + p.custom_mask_type == CausalFromBottomRight) { + auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + int shift = query_start - key_start; + if (p.custom_mask_type == CausalFromBottomRight) { + shift += p.num_keys - p.num_queries; + } + // current_key = key_start + accum_m + // current_query = query_start + accum_n + // mask if: `current_key > current_query` + MatmulQK::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (accum_m > accum_n + shift) { + accum[idx] = + -cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + } + + __syncthreads(); + if (kPrologueGV) { + prologueGradV(0); + } + if (kPrologueDOV) { + prologueDOV(); + } + + MatmulQK::B2bGemm::accumApplyLSEToSmem( + shared_storage.attn_shared_storage(), + accum, + p.logsumexp_ptr + query_start, + problem_size.n(), + thread_id, + warp_id, + lane_id, + output_tile_coords); +#if 0 + auto accum_ref_attnT = shared_storage.attn_shared_storage().accum_ref(); + PRINT_TENSOR4x4_T0_L0("attn_T", accum_ref_attnT); +#endif + + // if we are using dropout, compute Zij, writing it to shared memory. + // each element of Zij is: + // - 0 with probability dropout_p + // - 1 / (1 - dropout_p) with probability 1 - dropout_p + if (kApplyDropout) { + auto zij = shared_storage.zij().accum_ref(); + // each thread generates a contiguous sequence of elements in Zij, all + // in the same row. the reason they have to come from the same row is + // that sampling random numbers from a contiguous random number sequence + // is much more efficient than jumping around, and the linear offset of + // each element of Z (the global matrix) maps to an offset in a random + // number sequence. 
for Z, the end of a row and the beginning of the + // next have adjacent offsets, but for Zij (tile of global matrix), this + // is not necessarily the case. + // We must fill the entire `zij` shmem with values (even out of bounds + // on the K-dimension) otherwise we can get NaNs during the GEMM + const int kQueriesPerBlock = kBlockSizeI; + const int threads_per_row = cutlass::fast_min( + int32_t(kNumThreads / kQueriesPerBlock), num_keys_in_block); + const int elts_per_thread = cutlass::round_nearest( + cutlass::ceil_div(num_keys_in_block, threads_per_row), 4); + + const int thread_i = thread_id / threads_per_row; + const int thread_start_j = + (thread_id % threads_per_row) * elts_per_thread; + + if (thread_i < kQueriesPerBlock && thread_start_j < num_keys_in_block) { + curandStatePhilox4_32_10_t curand_state = curand_state_init; + skipahead( + (query_start + thread_i) * p.num_keys + + (key_start + thread_start_j), + &curand_state); + + // generate elements of Zij, 4 elements at a time + for (int zij_start_col_idx = thread_start_j; zij_start_col_idx < + cutlass::fast_min(thread_start_j + elts_per_thread, + num_keys_in_block); + zij_start_col_idx += 4) { + const float4 rand_uniform_quad = curand_uniform4(&curand_state); + + CUTLASS_PRAGMA_UNROLL + for (int quad_idx = 0; quad_idx < 4; ++quad_idx) { + // we'll write Zij transposed since attention is also transposed + // during the matmul to compute dV. + zij.at({zij_start_col_idx + quad_idx /*k*/, thread_i /*q*/}) = + (&rand_uniform_quad.x)[quad_idx] > p.dropout_prob + ? scalar_t(dropout_scale) + : scalar_t(0); + } + } + } + __syncthreads(); +#if 0 + PRINT_TENSOR4x4_T0_L0("zij", zij); + PRINT_TENSOR4x4_T0_L0_START("zij", zij, kBlockSizeJ - 4, kBlockSizeI - 4); +#endif + + // Save mask for later DOIVJ matmul + + int warp_idx_mn_0 = warp_id % + (MatmulDOIVJ::Mma::Base::WarpCount::kM * + MatmulDOIVJ::Mma::Base::WarpCount::kN); + auto output_tile_coords_doivj = cutlass::MatrixCoord{ + warp_idx_mn_0 % MatmulDOIVJ::Mma::Base::WarpCount::kM, + warp_idx_mn_0 / MatmulDOIVJ::Mma::Base::WarpCount::kM}; + auto lane_offset = MatmulDOIVJ::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords_doivj); + MatmulDOIVJ::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m /*q*/, int accum_n /*k*/, int idx) { + if (zij.at({accum_n, accum_m}) == scalar_t(0)) { + dropout_keep_mask_doivj[idx] = cutlass::uint1b_t{0}; + } + }, + [&](int accum_m) {}); + } + __syncthreads(); + } + rematerializeThreadIds(); + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradV matmul + // + // grad_v[j_start:j_end] += attn_T @ do_i + ///////////////////////////////////////////////////////////////////////////////////////////////// + constexpr bool kSingleIterationGradV = + kMaxK <= MatmulGradV::ThreadblockShape::kN; + for (int col = 0; col < (kSingleIterationGradV ? 
1 : p.head_dim_value); + col += MatmulGradV::ThreadblockShape::kN) { + using Mma = typename MatmulGradV::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_keys_in_block, p.head_dim_value - col, num_queries_in_block); + auto createEpilogueIter = [&]() { + return typename MatmulGradV::OutputTileIterator( + typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()}, + p.grad_value_ptr + key_start * p.gV_strideM() + col, + {num_keys_in_block, p.head_dim_value - col}, + thread_id); + }; + typename Mma::IteratorB iterator_B( + {int32_t(p.gO_strideM)}, + p.grad_output_ptr + query_start * p.gO_strideM + col, + {num_queries_in_block, p.head_dim_value - col}, + thread_id, + no_offset); + + // if dropout: dVj += (Pij.T * Zij) @ dOi + // otherwise: dVj += Pij.T @ dOi + Mma mma( + // operand A: Pij.T + shared_storage.attn_shared_storage().accum_ref(), + // operand A_scale Zij.T: + // if we're using dropout, operand A is Pij_dropped.T = Pij.T * Zij.T + // which is computed on the fly as fragments of Pij.T are loaded in + shared_storage.zij().accum_ref(), + // operand B: dOi - which was loaded into shared memory previously + // when we computed dVj + shared_storage.mm_gradV().operand_B_ref(), + thread_id, + warp_id, + lane_id); + + int storage_id = col / MatmulGradV::ThreadblockShape::kN; + AccumTileGmem gmem_tile{ + p.workspace_gv + storage_id * AccumTileGmem::kElementsStored}; + if (!kOutputInRF) { + if (isFirstQuery || !kNeedsAccumGradV) { + output_frags.gradV.clear(); + } else { + gmem_tile.load(output_frags.gradV, thread_id); + } + } + mma.set_prologue_done(kPrologueGV); + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + + mma(gemm_k_iterations, + output_frags.gradV, + iterator_B, + output_frags.gradV); + __syncthreads(); + if (kPrologueGV && !kSingleIterationGradV && + col + MatmulGradV::ThreadblockShape::kN < p.head_dim_value) { + prologueGradV(col + MatmulGradV::ThreadblockShape::kN); + } + + if (!kOutputInRF) { + if (kNeedsAccumGradV && !isLastQuery) { + gmem_tile.store(output_frags.gradV, thread_id); + } else { + accumulateInGmem( + shared_storage.gradV_epilogue(), + output_frags.gradV, + createEpilogueIter(), + isFirstQuery || kNeedsAccumGradV, + warp_id, + lane_id); + } + } + } + __syncthreads(); + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // MatmulDOIVJ + ///////////////////////////////////////////////////////////////////////////////////////////////// + { + using Mma = typename MatmulDOIVJ::Mma; + // do_i + typename Mma::IteratorA iterator_A( + {int32_t(p.gO_strideM)}, + p.grad_output_ptr + query_start * p.gO_strideM, + {num_queries_in_block, p.head_dim_value}, + thread_id, + no_offset); + + // v_j.transpose(-2, -1) + typename Mma::IteratorB iterator_B( + {int32_t(p.v_strideM)}, + p.value_ptr + key_start * p.v_strideM, + {p.head_dim_value, num_keys_in_block}, + thread_id, + no_offset); + + Mma mma(shared_storage.mm_doivj(), thread_id, warp_id, lane_id); + mma.set_prologue_done(kPrologueDOV); + mma.set_zero_outside_bounds(!skipBoundsChecks); + + typename Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = + (p.head_dim_value + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + __syncthreads(); + if (kPrologueGQ) { + 
prologueGradQ(0); + } + if (kPrologueGK) { + prologueGradK(0); + } + + int warp_idx_mn_0 = + warp_id % (Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % Mma::Base::WarpCount::kM, + warp_idx_mn_0 / Mma::Base::WarpCount::kM}; + // TODO: This must be terribly inefficient. There must be a better way + // tmp [RF] <- (accum [RF] - Di [smem] ) * attn_T.T [smem] + // attn_shared_storage [smem] <- tmp.T + // tmp_shared_storage [smem] <- tmp + { + using LambdaIterator = typename MatmulDOIVJ::AccumLambdaIterator; + auto lane_offset = LambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + // if dropout was used, compute dPij = dPij_dropped * Zij + if (kApplyDropout) { + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (dropout_keep_mask_doivj[idx].get()) { + accum[idx] *= dropout_scale; + } else { + accum[idx] = 0; + } + }, + [&](int accum_m) {}); + } + + auto attn_T = shared_storage.attn_shared_storage().accum_ref(); +#if 0 + PRINT_B0_T0("doivj_dropped"); + print_warp_accum(accum, lane_offset, 4, 4); + PRINT_TENSOR4x4_T0_L0("attn_T", attn_T) +#endif + accum_t current_di; + // dSij = (dPij - Di) * Pij + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { current_di = shared_storage.di()[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + // TODO: Otherwise we can get nans as we + // might have infs here (only seen on f16 tho) + if (skipBoundsChecks || + (accum_m < num_queries_in_block && + accum_n < num_keys_in_block)) { + accum_t attn = attn_T.at({accum_n, accum_m}); + accum[idx] = (accum[idx] - current_di) * attn; + } else { + accum[idx] = 0; + } + }, + [&](int accum_m) { + + }); + + // store bias gradient tile dBij to global memory, + // where dBij = dSij = Pij * (dPij - Di) + if (p.grad_bias_ptr != nullptr) { + typename MatmulDOIVJ::BiasGradEpilogue::OutputTileIterator + output_iter( + typename MatmulDOIVJ::BiasGradEpilogue::OutputTileIterator:: + Params{p.gB_strideM}, + // grad_bias_ptr is offset to point at beginning of + // matrix of shape (queries, keys) for a given + // (batch_id, head_id) the pointer arithmetic here produces + // a pointer to the start of the current tile within that + // matrix + p.grad_bias_ptr + query_start * p.gB_strideM + key_start, + {num_queries_in_block, num_keys_in_block}, + thread_id); + + // no-op epilogue operator - just casting and storing contents of + // accum to global memory + typename MatmulDOIVJ::BiasGradEpilogue::OutputOp output_op( + typename MatmulDOIVJ::BiasGradEpilogue::OutputOp::Params{1, 1}); + typename MatmulDOIVJ::BiasGradEpilogue epilogue( + shared_storage.gradB_epilogue(), thread_id, warp_id, lane_id); + epilogue(output_op, output_iter, accum, output_iter); + } + + accum = accum * scale; + +#if 0 + PRINT_B0_T0("(doivj - di) * attn * scale"); + print_warp_accum(accum, lane_offset, 4, 4); +#endif + + __syncthreads(); + if (!MatmulGradK::DefaultMmaFromSmem::kIsTransposedA) { + auto tmpT = shared_storage.tmpT_shared_storage().accum_ref(); + // attn <- attn_T.T + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + tmpT.at({accum_n, accum_m}) = scalar_t(accum[idx]); + }, + [&](int accum_m) {}); + } + } + + MatmulDOIVJ::B2bGemm::accumToSmem( + shared_storage.tmp_shared_storage(), + accum, + lane_id, + output_tile_coords); + __syncthreads(); + } + // Force `nvcc` to recompute values that depend on the variables just 
below + // to use less RF and prevent some spilling + p.head_dim = warp_uniform(p.head_dim); + p.k_strideM = warp_uniform(p.k_strideM); + rematerializeThreadIds(); + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradQ matmul + // + // grad_q[i_start:i_end] += tmp @ k_j + ///////////////////////////////////////////////////////////////////////////////////////////////// + // Skip the loop & associated branches if we know at compile time the number + // of iterations + constexpr bool kSingleIterationGradQ = + kMaxK <= MatmulGradQ::ThreadblockShape::kN; + for (int col = 0; col < (kSingleIterationGradQ ? 1 : p.head_dim); + col += MatmulGradQ::ThreadblockShape::kN) { + using Mma = typename MatmulGradQ::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_queries_in_block, + false ? MatmulGradQ::ThreadblockShape::kN : p.head_dim - col, + num_keys_in_block); + + // k_j + typename Mma::IteratorB iterator_B( + {int32_t(p.k_strideM)}, + p.key_ptr + key_start * p.k_strideM + col, + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + auto a = shared_storage.tmp_shared_storage().accum_ref(); + Mma mma( + // operand A: dSij + shared_storage.tmp_shared_storage().accum_ref(), + // operand B: Kj + shared_storage.mm_gradQ().operand_B_ref(), + thread_id, + warp_id, + lane_id); + + typename Mma::FragmentC accum; + + int col_id = col / MatmulGradQ::ThreadblockShape::kN; + int num_cols = kSingleIterationGradQ + ? 1 + : ceil_div(p.head_dim, MatmulGradQ::ThreadblockShape::kN); + int storage_id = (col_id + query_start / kBlockSizeI * num_cols); + + if (p.num_splits_key_device() > 1) { + AtomicLock::acquire( + &p.workspace_gq[storage_id].lock, + p.split_key_device() + 1, + thread_id); + // Make sure we can see other block's output + __threadfence(); + } + + AccumTileGmem gmem_tile{&p.workspace_gq[storage_id].buffer[0]}; + if (!kNeedsAccumGradQ || + (p.num_splits_key_device() == 1 && key_start == 0)) { + // if we know we are the first to access it, we know it's only zeros. 
+ // Avoids a load from gmem (and gmem init as well) + accum.clear(); + } else { + gmem_tile.load(accum, thread_id); + } + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + mma.set_prologue_done(kPrologueGQ); + mma(gemm_k_iterations, accum, iterator_B, accum); + __syncthreads(); + bool isLastColumn = kSingleIterationGradQ || + (col + MatmulGradQ::ThreadblockShape::kN >= p.head_dim); + if (kPrologueGQ && !isLastColumn) { + prologueGradQ(col + MatmulGradQ::ThreadblockShape::kN); + } + + bool isLast = [&]() { + int32_t next_key = key_start + p.num_splits_key_device() * kBlockSizeJ; + if (p.num_keys <= next_key) { + return true; + } + if (query_start < getSmallestQueryForKey(p, next_key)) { + return true; + } + return false; + }(); + // Output results + if (p.num_splits_key_device() > 1) { + int32_t numAddsSoFar = -1; + if (isLast && thread_id == 0) { + numAddsSoFar = atomicAdd(&p.workspace_gq[storage_id].counter, 1) + + 1; // `atomicAdd` returns the old value + } + isLast = __syncthreads_or( + numAddsSoFar == getNumParallelBlocksForQuery(p, query_start)); + assert(numAddsSoFar <= getNumParallelBlocksForQuery(p, query_start)); + } + if (kNeedsAccumGradQ && !isLast) { + gmem_tile.store(accum, thread_id); + if (p.num_splits_key_device() > 1) { + // Make sure everyone wrote before we release the lock + __threadfence(); + __syncthreads(); + AtomicLock::release(&p.workspace_gq[storage_id].lock, thread_id); + } + } else { + // NOTE: We're not releasing the lock because no one is expected + // to come after us (we're the last one to write) + typename MatmulGradQ::OutputTileIterator output_it( + typename MatmulGradQ::OutputTileIterator::Params{p.gQ_strideM()}, + p.grad_query_ptr + query_start * p.gQ_strideM() + col, + {problem_size.m(), problem_size.n()}, + thread_id); + bool storage_contains_zeros = kNeedsAccumGradQ || key_start == 0 || + (p.num_splits_key_device() > 1); + accumulateInGmem( + isLastColumn ? shared_storage.gradQ_epilogue_lastIter() + : shared_storage.gradQ_epilogue(), + accum, + output_it, + storage_contains_zeros, + warp_id, + lane_id); + } + } + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradK matmul + // + // grad_k[i_start:i_end] += tmp.transpose(-2, -1) @ q_i + ///////////////////////////////////////////////////////////////////////////////////////////////// + rematerializeThreadIds(); + + constexpr bool kSingleIterationGradK = + kMaxK <= MatmulGradK::ThreadblockShape::kN; + for (int col = 0; col < (kSingleIterationGradK ? 1 : p.head_dim); + col += MatmulGradK::ThreadblockShape::kN) { + using Mma = typename MatmulGradK::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_keys_in_block, + false ? MatmulGradK::ThreadblockShape::kN : p.head_dim - col, + num_queries_in_block); + auto createEpilogueIter = [&]() { + return typename MatmulGradK::OutputTileIterator( + typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()}, + p.grad_key_ptr + key_start * p.gK_strideM() + col, + {num_keys_in_block, + false ? 
MatmulGradK::ThreadblockShape::kN : p.head_dim - col}, + thread_id); + }; + + // q_i + typename Mma::IteratorB iterator_B( + {int32_t(p.q_strideM)}, + p.query_ptr + query_start * p.q_strideM + col, + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + auto getTmp = [&](int) { return &shared_storage.tmp_shared_storage(); }; + auto getTmpT = [&](int) { return &shared_storage.tmpT_shared_storage(); }; + // this is basically: + // opA = kIsTransposedA ? getTmp() : getTmpT(); + bool constexpr kIsTransposedA = + MatmulGradK::DefaultMmaFromSmem::kIsTransposedA; + auto& opA = *call_conditional< + kIsTransposedA, + decltype(getTmp), + decltype(getTmpT)>::apply(getTmp, getTmpT, 0); + Mma mma( + // operand A: dSij.T + opA.accum_ref(), + // operand B: Qi + shared_storage.mm_gradK().operand_B_ref(), + thread_id, + warp_id, + lane_id); + + int storage_id = col / MatmulGradK::ThreadblockShape::kN; + AccumTileGmem gmem_tile{ + p.workspace + storage_id * AccumTileGmem::kElementsStored}; + if (!kOutputInRF) { + if (isFirstQuery || !kNeedsAccumGradK) { + output_frags.gradK.clear(); + } else { + gmem_tile.load(output_frags.gradK, thread_id); + } + } + mma.set_prologue_done(kPrologueGK); + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + + mma(gemm_k_iterations, + output_frags.gradK, + iterator_B, + output_frags.gradK); + __syncthreads(); + bool isLastColumn = kSingleIterationGradK || + col + MatmulGradK::ThreadblockShape::kN >= p.head_dim; + if (kPrologueGK && !isLastColumn) { + prologueGradK(col + MatmulGradK::ThreadblockShape::kN); + } + + if (kPrologueQK && isLastColumn) { + int32_t next_query, next_key; + incrIteration(p, query_start, key_start, next_query, next_key); + DISPATCH_BOOL( + next_key != key_start, kForceReloadK, ([&]() { + prologueQkNextIteration( + shared_storage, p, next_query, next_key, warp_id, lane_id); + })); + } + + // Output results + if (!kOutputInRF) { + if (kNeedsAccumGradK && !isLastQuery) { + gmem_tile.store(output_frags.gradK, thread_id); + } else { + accumulateInGmem( + isLastColumn ? 
shared_storage.gradK_epilogue_final() + : shared_storage.gradK_epilogue(), + output_frags.gradK, + createEpilogueIter(), + isFirstQuery || kNeedsAccumGradK, + warp_id, + lane_id); + __syncthreads(); + } + } + } + } + + static CUTLASS_DEVICE int32_t getQueryStartShift(Params const& p) { + if (p.custom_mask_type == NoCustomMask && p.num_splits_key_device() > 1) { + return (p.split_key_device() * kBlockSizeI) % getQueryEnd(p); + } + return 0; + } + + // Iteration order logic + static CUTLASS_DEVICE int32_t + getQueryStart(Params const& p, int32_t key_start) { + return getSmallestQueryForKey(p, key_start) + getQueryStartShift(p); + }; + static CUTLASS_DEVICE int32_t getQueryEnd(Params const& p) { + return align_up(p.num_queries, kBlockSizeI); + }; + + static CUTLASS_DEVICE int32_t + getSmallestQueryForKey(Params const& p, int32_t key_start) { + if (p.custom_mask_type == CausalFromTopLeft) { + return (key_start / kBlockSizeI) * kBlockSizeI; + } else if (p.custom_mask_type == CausalFromBottomRight) { + int first_query = + cutlass::fast_max(0, key_start - p.num_keys + p.num_queries); + return (first_query / kBlockSizeI) * kBlockSizeI; + } + return 0; + }; + + // Returns how many kernel blocks will write to a given block in `grad_query` + // This is usually equal to the number of key splits, but can be different + // for instance in the causal case, or varying seqlen + static CUTLASS_DEVICE int32_t + getNumParallelBlocksForQuery(Params const& p, int32_t query_start) { + int16_t num_key_blocks = ceil_div(p.num_keys, kBlockSizeJ); + if (p.custom_mask_type == CausalFromTopLeft) { + int32_t last_key_for_block = query_start + kBlockSizeI - 1; + last_key_for_block = cutlass::fast_min(last_key_for_block, p.num_keys); + num_key_blocks = ceil_div(last_key_for_block, kBlockSizeJ); + } else if (p.custom_mask_type == CausalFromBottomRight) { + int32_t last_key_for_block = + query_start + (kBlockSizeI - 1) + (1 + p.num_keys - p.num_queries); + last_key_for_block = cutlass::fast_min(last_key_for_block, p.num_keys); + num_key_blocks = ceil_div(last_key_for_block, kBlockSizeJ); + } + return cutlass::fast_min(p.num_splits_key_device(), num_key_blocks); + }; + + // Returns the next block to process + static CUTLASS_DEVICE void incrIteration( + Params const& p, + int32_t query_start, + int32_t key_start, + int32_t& next_query, + int32_t& next_key) { + next_query = query_start + kBlockSizeI; + next_key = key_start; + auto query_shift = getQueryStartShift(p); + // Wrap around + if (query_shift) { + if (next_query >= p.num_queries) { + next_query = getSmallestQueryForKey(p, key_start); + return; + } else if (query_start < query_shift && query_shift <= next_query) { + // jump to next key + } else { + return; + } + } else { + if (next_query < p.num_queries) { + return; + } + // jump to next key + } + // Next key + next_key = key_start + p.num_splits_key_device() * kBlockSizeJ; + next_query = getQueryStart(p, next_key); + } + + template + static CUTLASS_DEVICE void prologueQkNextIteration( + SharedStorage& shared_storage, + Params const& p, + int32_t query_start, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) { + if (query_start >= p.num_queries || key_start >= p.num_keys) { + return; + } + + static constexpr bool kReloadK = + kForceReloadK || !MatmulQK::Mma::kSmemContainsEntireMat; + int thread_id = 32 * warp_id + lane_id; + typename MatmulQK::Mma::IteratorA iterator_A( + {int32_t(p.k_strideM)}, + p.key_ptr + key_start * p.k_strideM, + {p.num_keys - key_start, p.head_dim}, + thread_id, + 
cutlass::MatrixCoord{0, 0}); + + typename MatmulQK::Mma::IteratorB iterator_B( + {int32_t(p.q_strideM)}, + p.query_ptr + query_start * p.q_strideM, + {p.head_dim, p.num_queries - query_start}, + thread_id, + cutlass::MatrixCoord{0, 0}); + + MatmulQK::Mma::template prologue( + shared_storage.mm_qk_k(), + shared_storage.mm_qk_q(), + iterator_A, + iterator_B, + thread_id, + p.head_dim); + } + + template + static CUTLASS_DEVICE void writeFragsToGmem( + SharedStorage& shared_storage, + OutputFragments& output_frags, + Params const& p, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) { + uint16_t thread_id = 32 * warp_id + lane_id; + int32_t num_keys_in_block = skipBoundsChecks + ? MatmulQK::Mma::Shape::kM + : cutlass::fast_min( + (int32_t)MatmulQK::Mma::Shape::kM, p.num_keys - key_start); + typename MatmulGradV::OutputTileIterator outputV_it( + typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()}, + p.grad_value_ptr + key_start * p.gV_strideM(), + {num_keys_in_block, p.head_dim_value}, + thread_id); + + accumulateInGmem( + shared_storage.gradV_epilogue_final(), + output_frags.gradV, + outputV_it, + true, + warp_id, + lane_id); + + typename MatmulGradK::OutputTileIterator outputK_it( + typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()}, + p.grad_key_ptr + key_start * p.gK_strideM(), + {num_keys_in_block, + false ? MatmulGradK::ThreadblockShape::kN : p.head_dim}, + thread_id); + accumulateInGmem( + shared_storage.gradK_epilogue_final(), + output_frags.gradK, + outputK_it, + true, + warp_id, + lane_id); + } + + template + static CUTLASS_DEVICE void accumulateInGmem( + typename MatmulT::DefaultEpilogue::SharedStorage& epilogue_smem, + typename MatmulT::Mma::FragmentC const& accum, + typename MatmulT::OutputTileIterator output_it, + bool first, + uint8_t warp_id, + uint8_t lane_id) { + using DefaultEpilogue = typename MatmulT::DefaultEpilogue; + using DefaultOutputOp = typename MatmulT::DefaultOutputOp; + using Mma = typename MatmulT::Mma; + int thread_id = 32 * warp_id + lane_id; + DISPATCH_BOOL( + first, kIsFirst, ([&]() { + static constexpr auto ScaleType = kIsFirst::value + ? 
cutlass::epilogue::thread::ScaleType::Nothing + : cutlass::epilogue::thread::ScaleType::NoBetaScaling; + using EpilogueOutputOp = + typename cutlass::epilogue::thread::LinearCombination< + typename DefaultOutputOp::ElementOutput, + DefaultOutputOp::kCount, + typename DefaultOutputOp::ElementAccumulator, + typename DefaultOutputOp::ElementCompute, + ScaleType>; + using Epilogue = + typename cutlass::epilogue::threadblock::EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename MatmulT::OutputTileIterator, + typename DefaultEpilogue::AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true // IterationsUnroll + >; + EpilogueOutputOp rescale({1, 1}); + Epilogue epilogue(epilogue_smem, thread_id, warp_id, lane_id); + epilogue(rescale, output_it, accum, output_it); + })); + } + + template + static CUTLASS_DEVICE void computeDelta( + Params const& p, + int32_t query_start, + uint8_t warp_id, + uint8_t lane_id) { + // Each thread computes one value for Delta + // Depending on warp configuration, we might have multiple + // threads of the same warp working on the same row + using AccessType = cutlass::Array; + static_assert(kNumThreads >= kBlockSizeI, ""); + static constexpr int kNumThreadsPerLine = kNumThreads / kBlockSizeI; + int16_t thread_id = 32 * warp_id + lane_id; + + int16_t laneFirstCol = kElementsPerAccess * (lane_id % kNumThreadsPerLine); + int16_t laneRow = thread_id / kNumThreadsPerLine; + bool rowPred = (query_start + laneRow) < p.num_queries; + bool pred = rowPred; + + // on windows, previous syntax __restrict__ AccessType* + // resulted in error: "restrict" is not allowed + const AccessType* __restrict__ grad_output_ptr = + reinterpret_cast( + p.grad_output_ptr + (query_start + laneRow) * p.gO_strideM + + laneFirstCol); + const AccessType* __restrict__ output_ptr = + reinterpret_cast( + p.output_ptr + (query_start + laneRow) * p.o_strideM() + + laneFirstCol); + + static constexpr int64_t kMaxIters = + kMaxK / (kElementsPerAccess * kNumThreadsPerLine); + constexpr int kPipelineStages = 2; + accum_t delta_value = accum_t(0); + using GlobalLoad = + cutlass::arch::global_load; + AccessType frag_grad_output[kPipelineStages]; + AccessType frag_output[kPipelineStages]; + + auto loadAndIncrement = [&](int ld_pos, bool is_valid) { + frag_grad_output[ld_pos].clear(); + frag_output[ld_pos].clear(); + GlobalLoad(frag_grad_output[ld_pos], grad_output_ptr, is_valid); + GlobalLoad(frag_output[ld_pos], output_ptr, is_valid); + grad_output_ptr += kNumThreadsPerLine; + output_ptr += kNumThreadsPerLine; + }; + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kPipelineStages - 1; ++iter) { + int ld_pos = iter % kPipelineStages; + pred = pred && + (laneFirstCol + iter * kElementsPerAccess * kNumThreadsPerLine) < + p.head_dim_value; + loadAndIncrement(ld_pos, pred); + } + auto columnIteration = [&](int iter) { + // Load for next iter + int ld_pos = (iter + kPipelineStages - 1) % kPipelineStages; + pred = pred && + (laneFirstCol + + (iter + kPipelineStages - 1) * kElementsPerAccess * + kNumThreadsPerLine) < p.head_dim_value; + loadAndIncrement(ld_pos, pred); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < AccessType::kElements; ++i) { + delta_value += accum_t(frag_output[iter % kPipelineStages][i]) * + accum_t(frag_grad_output[iter % kPipelineStages][i]); + } + }; + 
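+    // In scalar terms, the columnIteration calls below accumulate, for the
+    // row q = query_start + laneRow,
+    //   delta[q] = sum_d grad_output[q, d] * output[q, d]
+    // (completed by the cross-lane shuffle reduction at the end). This is the
+    // (do_i * o_i).sum(-1) value consumed as Di in dSij = Pij * (dPij - Di);
+    // the software pipelining only reorders the loads, not the reduction.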
+ // If we have a small lower-bound for K, we can unroll the loop + if (kMaxK <= 256) { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kMaxIters; ++iter) { + columnIteration(iter); + } + } else { + int num_iters = + ceil_div(p.head_dim_value, kElementsPerAccess * kNumThreadsPerLine) * + (kElementsPerAccess * kNumThreadsPerLine); + for (int iter = 0; iter < num_iters; ++iter) { + columnIteration(iter); + } + } + + // Reduce between workers + static_assert( + kNumThreadsPerLine == 1 || kNumThreadsPerLine == 2 || + kNumThreadsPerLine == 4, + ""); + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kNumThreadsPerLine; i *= 2) { + delta_value = delta_value + __shfl_xor_sync(0xffffffff, delta_value, i); + } + + // Store in gmem + if (rowPred) { + p.delta_ptr[query_start + laneRow] = delta_value; + } + } +}; + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_backward_batched_impl(typename AK::Params p) { + if (!p.advance_to_block()) { + return; + } + AK::attention_kernel(p); +} + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_backward_batched(typename AK::Params params); diff --git a/examples/41_fused_multi_head_attention/kernel_forward.h b/examples/41_fused_multi_head_attention/kernel_forward.h new file mode 100644 index 0000000000..71d79415e9 --- /dev/null +++ b/examples/41_fused_multi_head_attention/kernel_forward.h @@ -0,0 +1,1322 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#ifdef HAS_PYTORCH +#include +#include +#endif + +#include +#include +#include +#include + +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/vector.h" +#include "cutlass/matrix.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_simt.h" +#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/kernel/default_gemm.h" +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/default_mma_core_simt.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/platform/platform.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" +#include "debug_utils.h" +#include "epilogue/epilogue_pipelined.h" +#include "epilogue/epilogue_rescale_output.h" +#include "gemm/custom_mma.h" +#include "gemm/find_default_mma.h" +#include "gemm/mma_from_smem.h" +#include "gemm_kernel_utils.h" +#include "transform/tile_smem_loader.h" + +using namespace gemm_kernel_utils; + +namespace { +template +constexpr int getWarpsPerSmFw() { + return ( + Arch::kMinComputeCapability >= 80 && + !cutlass::platform::is_same::value + ? 16 + : 12); +} +static CUTLASS_DEVICE float atomicMaxFloat(float* addr, float value) { + // source: https://stackoverflow.com/a/51549250 + return (value >= 0) + ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) + : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value))); +} +} // namespace + +// If ToBatchHookType_ is supplied other than this default (which is +// never the case in the xformers library) then the user is +// defining the logic which each block uses to find its data to work on, +// with the advance_to_batch function with the following signature. +// It should return false if there is no work to do for this block. +// In general this will not work with saving for backward due to fixed layout +// for logsumexp and incompatible rngs for dropout, so is likely only useful for +// custom inference. 
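+// As a purely illustrative sketch (the name is hypothetical), a custom hook
+// would follow the same shape as the default below, e.g.:
+//
+//   struct MyToBatchHook {
+//     template <typename Params>
+//     CUTLASS_DEVICE static bool advance_to_batch(
+//         Params& p, int64_t& q_start, int64_t& k_start) {
+//       // Compute q_start / k_start for this block from p and blockIdx,
+//       // adjust p's pointers as needed, and return false when this block
+//       // has no work to do.
+//       return true;
+//     }
+//   };
+//
+// The default hook, with the signature described above, is: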
+struct DefaultToBatchHook { + template + CUTLASS_DEVICE static bool advance_to_batch( + Params&, + int64_t& /* q_start */, + int64_t& /* k_start */) { + return true; + } +}; + +template < + // The datatype of Q/K/V + typename scalar_t_, + // Architecture we are targeting (eg `cutlass::arch::Sm80`) + typename ArchTag, + // If Q/K/V are correctly aligned in memory and we can run a fast kernel + bool isAligned_, + int kQueriesPerBlock_, + int kKeysPerBlock_, + // upperbound on `max(value.shape[-1], query.shape[-1])` + int kMaxK_ = (int)cutlass::platform::numeric_limits::max(), + // This is quite slower on V100 for some reason + // Set to false if you know at compile-time you will never need dropout + bool kSupportsDropout_ = true, + bool kSupportsBias_ = true, + typename ToBatchHookType_ = DefaultToBatchHook> +struct AttentionKernel { + enum CustomMaskType { + NoCustomMask = 0, + CausalFromTopLeft = 1, + CausalFromBottomRight = 2, + NumCustomMaskTypes, + }; + + using scalar_t = scalar_t_; + using accum_t = float; + using lse_scalar_t = float; + using output_t = scalar_t; + // Accumulator between 2 iterations + // Using `accum_t` improves perf on f16 at the cost of + // numerical errors + using output_accum_t = accum_t; + static constexpr bool kSupportsDropout = kSupportsDropout_; + static constexpr bool kSupportsBias = kSupportsBias_; + static constexpr int kKeysPerBlock = kKeysPerBlock_; + static constexpr int kQueriesPerBlock = kQueriesPerBlock_; + static constexpr int kMaxK = kMaxK_; + static constexpr bool kIsAligned = isAligned_; + static constexpr bool kSingleValueIteration = kMaxK <= kKeysPerBlock; + static constexpr int32_t kAlignLSE = 32; // block size of backward + static constexpr bool kIsHalf = cutlass::sizeof_bits::value == 16; + static constexpr bool kPreloadV = + ArchTag::kMinComputeCapability >= 80 && kIsHalf; + static constexpr bool kKeepOutputInRF = kSingleValueIteration; + static constexpr bool kNeedsOutputAccumulatorBuffer = !kKeepOutputInRF && + !cutlass::platform::is_same::value; + + static_assert(kQueriesPerBlock % 32 == 0, ""); + static_assert(kKeysPerBlock % 32 == 0, ""); + static constexpr int kNumWarpsPerBlock = + kQueriesPerBlock * kKeysPerBlock / (32 * 32); + static constexpr int kWarpSize = 32; + + // Launch bounds + static constexpr int kNumThreads = kWarpSize * kNumWarpsPerBlock; + static constexpr int kMinBlocksPerSm = + getWarpsPerSmFw() / kNumWarpsPerBlock; + + struct Params { + // Input tensors + scalar_t* query_ptr = nullptr; // [num_queries, num_heads, head_dim] + scalar_t* key_ptr = nullptr; // [num_keys, num_heads, head_dim] + scalar_t* value_ptr = nullptr; // [num_keys, num_heads, head_dim_value] + scalar_t* attn_bias_ptr = nullptr; // [num_heads, num_queries, num_keys] + int32_t* seqstart_q_ptr = nullptr; + int32_t* seqstart_k_ptr = nullptr; + + int32_t* seqlen_k_ptr = nullptr; + uint32_t causal_diagonal_offset = 0; + + // Output tensors + output_t* output_ptr = nullptr; // [num_queries, num_heads, head_dim_value] + // [num_queries, num_heads, head_dim_value] + output_accum_t* output_accum_ptr = nullptr; + // [num_heads, num_queries] - can be null + lse_scalar_t* logsumexp_ptr = nullptr; + + // Scale + accum_t scale = 0.0; + + // Dimensions/strides + int32_t head_dim = 0; + int32_t head_dim_value = 0; + int32_t num_queries = 0; + int32_t num_keys = 0; + int32_t num_keys_absolute = 0; + + uint8_t custom_mask_type = NoCustomMask; + + int32_t q_strideM = 0; + int32_t k_strideM = 0; + int32_t v_strideM = 0; + int32_t bias_strideM = 0; + + int32_t 
o_strideM = 0; + + // Everything below is only used in `advance_to_block` + // and shouldn't use registers + int32_t q_strideH = 0; + int32_t k_strideH = 0; + int32_t v_strideH = 0; + int64_t bias_strideH = 0; + + int64_t q_strideB = 0; + int64_t k_strideB = 0; + int64_t v_strideB = 0; + int64_t bias_strideB = 0; + + int32_t num_batches = 0; + int32_t num_heads = 0; + + // dropout + bool use_dropout = false; + unsigned long long dropout_batch_head_rng_offset = 0; + float dropout_prob = 0.0f; +#ifdef HAS_PYTORCH + at::PhiloxCudaState rng_engine_inputs = at::PhiloxCudaState(0, 0); +#endif + + // Moves pointers to what we should process + // Returns "false" if there is no work to do + CUTLASS_DEVICE bool advance_to_block() { + auto batch_id = blockIdx.z; + auto head_id = blockIdx.y; + auto query_start = blockIdx.x * kQueriesPerBlock; + + auto lse_dim = ceil_div((int32_t)num_queries, kAlignLSE) * kAlignLSE; + + if (kSupportsDropout) { + dropout_batch_head_rng_offset = + batch_id * num_heads * num_queries * num_keys + + head_id * num_queries * num_keys; + } + + int64_t q_start = 0, k_start = 0; + // Advance to current batch - in case of different sequence lengths + constexpr bool kToBatchHook = + !cutlass::platform::is_same:: + value; + if (kToBatchHook) { + // Call out to a custom implementation. + if (!ToBatchHookType_::advance_to_batch(*this, q_start, k_start)) { + return false; + } + } else if (seqstart_q_ptr != nullptr) { + assert(seqstart_k_ptr != nullptr); + seqstart_q_ptr += batch_id; + + q_start = seqstart_q_ptr[0]; + int64_t q_next_start = seqstart_q_ptr[1]; + int64_t k_end; + seqstart_k_ptr += batch_id; + + if (seqlen_k_ptr) { + k_start = seqstart_k_ptr[0]; + k_end = k_start + seqlen_k_ptr[batch_id]; + } else { + k_start = seqstart_k_ptr[0]; + k_end = seqstart_k_ptr[1]; + } + + num_queries = q_next_start - q_start; + num_keys = k_end - k_start; + + if (query_start >= num_queries) { + return false; + } + } else { + query_ptr += batch_id * q_strideB; + key_ptr += batch_id * k_strideB; + value_ptr += batch_id * v_strideB; + output_ptr += int64_t(batch_id * num_queries) * o_strideM; + if (output_accum_ptr != nullptr) { + output_accum_ptr += + int64_t(batch_id * num_queries) * (head_dim_value * num_heads); + } + q_start = 0; + k_start = 0; + } + + // Advance to the current batch / head / query_start + query_ptr += (q_start + query_start) * q_strideM + head_id * q_strideH; + key_ptr += k_start * k_strideM + head_id * k_strideH; + + value_ptr += k_start * v_strideM + head_id * v_strideH; + output_ptr += + int64_t(q_start + query_start) * o_strideM + head_id * head_dim_value; + + if (kSupportsBias && attn_bias_ptr != nullptr) { + attn_bias_ptr += (batch_id * bias_strideB) + (head_id * bias_strideH); + } + if (output_accum_ptr != nullptr) { + output_accum_ptr += + int64_t(q_start + query_start) * (head_dim_value * num_heads) + + head_id * head_dim_value; + } else { + // Accumulate directly in the destination buffer (eg for f32) + output_accum_ptr = (accum_t*)output_ptr; + } + + if (logsumexp_ptr != nullptr) { + // lse[batch_id, head_id, query_start] + logsumexp_ptr += + batch_id * lse_dim * num_heads + head_id * lse_dim + query_start; + } + + // Custom masking + if (custom_mask_type == CausalFromBottomRight) { + causal_diagonal_offset = num_keys - num_queries; + } + // We use num_keys_absolute to index into the rng_state + // We need this index to match between forward and backwards + num_keys_absolute = num_keys; + if (custom_mask_type == CausalFromTopLeft || + custom_mask_type == 
CausalFromBottomRight) { + // the bottom row of the current block is query_start + kQueriesPerBlock + // the last active key is then query_start + causal_diagonal_offset + + // kQueriesPerBlock so num_keys is the min between actual num_keys and + // this to avoid extra computations + num_keys = cutlass::fast_min( + int32_t(query_start + causal_diagonal_offset + kQueriesPerBlock), + num_keys); + } + + num_queries -= query_start; + num_batches = 0; // no longer used after + + // If num_queries == 1, and there is only one key head we're wasting + // 15/16th of tensor core compute In that case : + // - we only launch kernels for head_id % kQueriesPerBlock == 0 + // - we iterate over heads instead of queries (strideM = strideH) + if (num_queries == 1 && k_strideH == 0 && v_strideH == 0) { + if (head_id % kQueriesPerBlock != 0) + return false; + q_strideM = q_strideH; + num_queries = num_heads; + num_heads = 1; // unused but here for intent + // remove causal since n_query = 1 + // otherwise, offset would change with head ! + custom_mask_type = NoCustomMask; + o_strideM = head_dim_value; + } + + // Make sure the compiler knows these variables are the same on all + // the threads of the warp. + // Only worth doing if they could have been modified above. + query_ptr = warp_uniform(query_ptr); + key_ptr = warp_uniform(key_ptr); + value_ptr = warp_uniform(value_ptr); + if (kSupportsBias) { + attn_bias_ptr = warp_uniform(attn_bias_ptr); + } + output_ptr = warp_uniform(output_ptr); + output_accum_ptr = warp_uniform(output_accum_ptr); + logsumexp_ptr = warp_uniform(logsumexp_ptr); + num_queries = warp_uniform(num_queries); + num_keys = warp_uniform(num_keys); + num_heads = warp_uniform(num_heads); + o_strideM = warp_uniform(o_strideM); + custom_mask_type = warp_uniform(custom_mask_type); + return true; + } + + __host__ dim3 getBlocksGrid() const { + return dim3( + ceil_div(num_queries, (int32_t)kQueriesPerBlock), + num_heads, + num_batches); + } + + __host__ dim3 getThreadsGrid() const { + return dim3(kWarpSize, kNumWarpsPerBlock, 1); + } + }; + + struct MM0 { + /* + In this first matmul, we compute a block of `Q @ K.T`. + While the calculation result is still hot in registers, we update + `mi`, `m_prime`, `s_prime` in shared-memory, and then store this value + into a shared-memory ("AccumulatorSharedStorage") that is used later as + operand A for the second matmul (see MM1) + */ + using GemmType = DefaultGemmType; + + using OpClass = typename GemmType::OpClass; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration< + OpClass, + ArchTag, + scalar_t, + scalar_t, + scalar_t, // ElementC + accum_t // ElementAccumulator + >; + static constexpr int kAlignmentA = + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment; + static constexpr int kAlignmentB = + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; + using ThreadblockShape = cutlass::gemm:: + GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::ColumnMajor, // LayoutB, + kAlignmentB, + accum_t, + cutlass::layout::RowMajor, // LayoutC, + OpClass, + ArchTag, // ArchTag + ThreadblockShape, // ThreadblockShape + WarpShape, // WarpShape + typename GemmType::InstructionShape, // InstructionShape + ArchTag::kMinComputeCapability >= 80 && kIsHalf + ? 
4 + : DefaultConfig::kStages, + typename GemmType::Operator // Operator + >::DefaultMma; + using MmaCore = typename DefaultMma::MmaCore; + using IteratorA = typename DefaultMma::IteratorA; + using IteratorB = typename DefaultMma::IteratorB; + using DefaultThreadblockMma = typename DefaultMma::ThreadblockMma; + using Mma = typename cutlass::platform::conditional< + kSingleValueIteration, + typename MakeCustomMma::Mma, + DefaultThreadblockMma>::type; + using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator< + typename Mma::Operator::IteratorC, + accum_t, + kWarpSize>::Iterator; + static_assert( + MmaCore::WarpCount::kM * MmaCore::WarpCount::kN * + MmaCore::WarpCount::kK == + kNumWarpsPerBlock, + ""); + + // used for efficient load of bias tile Bij from global to shared memory + using BiasLoader = TileSmemLoader< + scalar_t, + cutlass::MatrixShape, + MmaCore::kThreads, + // input restriction: kv_len has to be a multiple of this value + 128 / cutlass::sizeof_bits::value>; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm< + typename Mma::Operator::IteratorC, + typename Mma::Operator, + scalar_t, + WarpShape, + ThreadblockShape>; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MM1 { + /** + Second matmul: perform `attn @ V` where `attn` is the attention (not + normalized) and stored in shared memory + */ + using GemmType = DefaultGemmType; + + using OpClass = typename GemmType::OpClass; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration< + OpClass, + ArchTag, + scalar_t, + scalar_t, + output_accum_t, // ElementC + accum_t // ElementAccumulator + >; + static constexpr int kAlignmentA = DefaultConfig::kAlignmentA; // from smem + static constexpr int kAlignmentB = + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; + using ThreadblockShape = cutlass::gemm:: + GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using LayoutB = cutlass::layout::RowMajor; + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + kAlignmentA, + scalar_t, // ElementB, + LayoutB, // LayoutB, + kAlignmentB, + output_accum_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + ArchTag::kMinComputeCapability >= 80 && kIsHalf + ? 
4 + : DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Policy::Operator::Shape, // WarpShape + typename DefaultGemm::Mma::Policy::Operator::InstructionShape, + typename DefaultGemm::Mma::Policy::Operator::IteratorA, + typename DefaultGemm::Mma::Policy>::WarpIterator; + using DefaultMmaFromSmem = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MM0::AccumulatorSharedStorage::Shape::kN, // kMaxK + WarpIteratorA, + false>; // kScaleOperandA + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + static_assert( + WarpCount::kM * WarpCount::kN * WarpCount::kK == kNumWarpsPerBlock, + ""); + + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::PredicatedTileIterator< + typename DefaultEpilogue::OutputTileIterator::ThreadMap, + output_t>; + using OutputTileIteratorAccum = + typename cutlass::epilogue::threadblock::PredicatedTileIterator< + typename DefaultEpilogue::OutputTileIterator::ThreadMap, + output_accum_t>; + }; + + static constexpr int64_t kAlignmentQ = MM0::kAlignmentA; + static constexpr int64_t kAlignmentK = MM0::kAlignmentB; + static constexpr int64_t kAlignmentV = 1; + + // Shared storage - depends on kernel params + struct ScalingCoefs { + cutlass::Array m_prime; + cutlass::Array s_prime; + cutlass::Array mi; + cutlass::Array out_rescale; + cutlass::Array + addition_storage; + }; + + struct SharedStorageEpilogueAtEnd : ScalingCoefs { + struct SharedStorageAfterMM0 { + // Everything here might be overwritten during MM0 + union { + typename MM0::BiasLoader::SmemTile bias; + typename MM0::AccumulatorSharedStorage si; + }; + typename MM1::Mma::SharedStorage mm1; + }; + + union { + typename MM0::Mma::SharedStorage mm0; + SharedStorageAfterMM0 after_mm0; + typename MM1::DefaultEpilogue::SharedStorage epilogue; + }; + + CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& + epilogue_shared_storage() { + return epilogue; + } + }; + + struct SharedStorageEpilogueInLoop : ScalingCoefs { + struct SharedStorageAfterMM0 { + // Everything here might be overwritten during MM0 + union { + typename MM0::BiasLoader::SmemTile bias; + typename MM0::AccumulatorSharedStorage si; + }; + typename MM1::Mma::SharedStorage mm1; + typename MM1::DefaultEpilogue::SharedStorage epilogue; + }; + + union { + typename MM0::Mma::SharedStorage mm0; + SharedStorageAfterMM0 after_mm0; + }; + + CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& + epilogue_shared_storage() { + return after_mm0.epilogue; + } + }; + + using SharedStorage = typename cutlass::platform::conditional< + kSingleValueIteration || kKeepOutputInRF, + SharedStorageEpilogueAtEnd, + SharedStorageEpilogueInLoop>::type; + + static bool __host__ check_supported(Params const& p) { + CHECK_ALIGNED_PTR(p.query_ptr, kAlignmentQ); + CHECK_ALIGNED_PTR(p.key_ptr, kAlignmentK); + CHECK_ALIGNED_PTR(p.value_ptr, kAlignmentV); + if (kSupportsBias) { + CHECK_ALIGNED_PTR(p.attn_bias_ptr, kAlignmentQ); + XFORMERS_CHECK( + p.num_batches <= 1 || p.bias_strideB % kAlignmentQ == 0, + "attn_bias is not correctly aligned (strideB)"); + XFORMERS_CHECK( + p.num_heads <= 1 || p.bias_strideH % kAlignmentQ == 0, + "attn_bias is not correctly aligned (strideH)"); + XFORMERS_CHECK( + 
p.bias_strideM % kAlignmentQ == 0, + "attn_bias is not correctly aligned"); + } + XFORMERS_CHECK( + p.q_strideM % kAlignmentQ == 0, + "query is not correctly aligned (strideM)"); + XFORMERS_CHECK( + p.k_strideM % kAlignmentK == 0, + "key is not correctly aligned (strideM)"); + XFORMERS_CHECK( + p.v_strideM % kAlignmentV == 0, + "value is not correctly aligned (strideM)"); + XFORMERS_CHECK( + p.num_heads <= 1 || p.q_strideH % kAlignmentQ == 0, + "query is not correctly aligned (strideH)"); + XFORMERS_CHECK( + p.num_heads <= 1 || p.k_strideH % kAlignmentK == 0, + "key is not correctly aligned (strideH)"); + XFORMERS_CHECK( + p.num_heads <= 1 || p.v_strideH % kAlignmentV == 0, + "value is not correctly aligned (strideH)"); + XFORMERS_CHECK( + p.custom_mask_type < NumCustomMaskTypes, + "invalid value for `custom_mask_type`"); + return true; + } + + static void CUTLASS_DEVICE attention_kernel(Params& p) { + // In this block, we will only ever: + // - read query[query_start:query_end, :] + // - write to output[query_start:query_end, :] + + extern __shared__ char smem_buffer[]; + SharedStorage& shared_storage = *((SharedStorage*)smem_buffer); + auto& m_prime = shared_storage.m_prime; + auto& s_prime = shared_storage.s_prime; + auto& mi = shared_storage.mi; + auto& out_rescale = shared_storage.out_rescale; + const uint32_t query_start = blockIdx.x * kQueriesPerBlock; + + static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, ""); + if (thread_id() < kQueriesPerBlock) { + s_prime[thread_id()] = accum_t(0); + out_rescale[thread_id()] = accum_t(1.0); + m_prime[thread_id()] = + -cutlass::platform::numeric_limits::infinity(); + mi[thread_id()] = -cutlass::platform::numeric_limits::infinity(); + } + typename MM1::Mma::FragmentC accum_o; + accum_o.clear(); + + auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator { + using OutputTileIterator = typename MM1::OutputTileIterator; + return OutputTileIterator( + typename OutputTileIterator::Params{(int32_t)p.o_strideM}, + p.output_ptr, + typename OutputTileIterator::TensorCoord{ + p.num_queries, p.head_dim_value}, + thread_id(), + {0, col}); + }; + + auto createOutputAccumIter = [&](int col) -> + typename MM1::OutputTileIteratorAccum { + using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum; + return OutputTileIteratorAccum( + typename OutputTileIteratorAccum::Params{ + (int32_t)(p.head_dim_value * p.num_heads)}, + p.output_accum_ptr, + typename OutputTileIteratorAccum::TensorCoord{ + p.num_queries, p.head_dim_value}, + thread_id(), + {0, col}); + }; + +#ifdef HAS_PYTORCH + curandStatePhilox4_32_10_t curand_state_init; + if (kSupportsDropout && p.use_dropout) { + const auto seeds = at::cuda::philox::unpack(p.rng_engine_inputs); + + // each element of the attention matrix P with shape + // (batch_sz, n_heads, n_queries, n_keys) is associated with a single + // offset in RNG sequence. we initialize the RNG state with offset that + // starts at the beginning of a (n_queries, n_keys) matrix for this + // block's batch_id and head_id + // initializing rng state is very expensive, so we run once per kernel, + // rather than once per iteration. each iteration takes a copy of the + // initialized RNG state and offsets it as needed. 
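+      // As a concrete, purely illustrative example: with n_heads = 8,
+      // n_queries = 512 and n_keys = 512, the block for batch_id = 2 and
+      // head_id = 3 gets
+      //   dropout_batch_head_rng_offset = 2*8*512*512 + 3*512*512 = 4,980,736
+      // and element (q, k) of this block's attention matrix then maps to
+      //   dropout_batch_head_rng_offset + q * n_keys + k
+      // in the Philox sequence, matching the skipahead() call further down.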
+ curand_init( + std::get<0>(seeds), + 0, + std::get<1>(seeds) + p.dropout_batch_head_rng_offset, + &curand_state_init); + } +#endif + + // Iterate through keys + for (int32_t iter_key_start = 0; iter_key_start < p.num_keys; + iter_key_start += kKeysPerBlock) { + int32_t problem_size_0_m = + cutlass::fast_min((int32_t)kQueriesPerBlock, p.num_queries); + int32_t problem_size_0_n = cutlass::fast_min( + int32_t(kKeysPerBlock), p.num_keys - iter_key_start); + int32_t const& problem_size_0_k = p.head_dim; + int32_t const& problem_size_1_n = p.head_dim_value; + int32_t const& problem_size_1_k = problem_size_0_n; + + auto prologueV = [&](int blockN) { + typename MM1::Mma::IteratorB iterator_V( + typename MM1::IteratorB::Params{typename MM1::LayoutB(p.v_strideM)}, + p.value_ptr + iter_key_start * p.v_strideM, + {problem_size_1_k, problem_size_1_n}, + thread_id(), + cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); + MM1::Mma::prologue( + shared_storage.after_mm0.mm1, + iterator_V, + thread_id(), + problem_size_1_k); + }; + + __syncthreads(); // Need to have shared memory initialized, and `m_prime` + // updated from end of prev iter + // + // MATMUL: Q.K_t + // + // Computes the block-matrix product of: + // (a) query[query_start:query_end, :] + // with + // (b) key[iter_key_start:iter_key_start + kKeysPerBlock] + // and stores that into `shared_storage.si` + // + + // Compute threadblock location + cutlass::gemm::GemmCoord tb_tile_offset = {0, 0, 0}; + + cutlass::MatrixCoord tb_offset_A{ + tb_tile_offset.m() * MM0::Mma::Shape::kM, tb_tile_offset.k()}; + + cutlass::MatrixCoord tb_offset_B{ + tb_tile_offset.k(), tb_tile_offset.n() * MM0::Mma::Shape::kN}; + + // Construct iterators to A and B operands + typename MM0::IteratorA iterator_A( + typename MM0::IteratorA::Params( + typename MM0::MmaCore::LayoutA(p.q_strideM)), + p.query_ptr, + {problem_size_0_m, problem_size_0_k}, + thread_id(), + tb_offset_A); + + typename MM0::IteratorB iterator_B( + typename MM0::IteratorB::Params( + typename MM0::MmaCore::LayoutB(p.k_strideM)), + p.key_ptr + iter_key_start * p.k_strideM, + {problem_size_0_k, problem_size_0_n}, + thread_id(), + tb_offset_B); + + auto my_warp_id = warp_uniform(warp_id()); + auto my_lane_id = lane_id(); + + // Construct thread-scoped matrix multiply + typename MM0::Mma mma( + shared_storage.mm0, thread_id(), my_warp_id, my_lane_id); + + typename MM0::Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = + (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + __syncthreads(); + + if (kPreloadV) { + prologueV(0); + } else { + MM1::Mma::drain_cp_asyncs(); + } + + typename MM0::Mma::Operator::IteratorC::TensorCoord + iteratorC_tile_offset = { + (tb_tile_offset.m() * MM0::Mma::WarpCount::kM) + + (my_warp_id % MM0::Mma::WarpCount::kM), + (tb_tile_offset.n() * MM0::Mma::WarpCount::kN) + + (my_warp_id / MM0::Mma::WarpCount::kM)}; + + // multiply by scaling factor + if (kSupportsBias) { + accum = + cutlass::multiplies()(p.scale, accum); + } + + // apply attention bias if applicable + if (kSupportsBias && p.attn_bias_ptr != nullptr) { + // load bias tile Bij into shared memory + typename MM0::BiasLoader::GmemTileIterator bias_iter( + {cutlass::layout::RowMajor(p.bias_strideM)}, + // attn_bias_pointer points to matrix of size (n_queries, n_keys) + // for the relevant batch_id and head_id + p.attn_bias_ptr + query_start * p.bias_strideM + 
iter_key_start, + {problem_size_0_m, problem_size_0_n}, + thread_id()); + cutlass::TensorRef bias_tensor_ref( + shared_storage.after_mm0.bias.data(), + cutlass::layout::RowMajor(MM0::ThreadblockShape::kN)); + typename MM0::BiasLoader::SmemTileIterator smem_tile_iter( + bias_tensor_ref, thread_id()); + MM0::BiasLoader::load(bias_iter, smem_tile_iter); + + // Pij += Bij, Pij is in register fragment and Bij is in shared memory + auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset( + my_lane_id, my_warp_id, iteratorC_tile_offset); + MM0::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (accum_m < problem_size_0_m && accum_n < problem_size_0_n) { + accum[idx] += bias_tensor_ref.at({accum_m, accum_n}); + } + }, + [&](int accum_m) {}); + } + + // Mask out last if causal + // This is only needed if upper-right corner of current query / key block + // intersects the mask Coordinates of upper-right corner of current block + // is y=query_start x=min(iter_key_start + kKeysPerBlock, num_keys)) The + // first masked element is x = y + offset -> query_start + offset There is + // intersection (and we need to mask) if min(iter_key_start + + // kKeysPerBlock, num_keys)) >= query_start + offset + if (p.custom_mask_type && + cutlass::fast_min(iter_key_start + kKeysPerBlock, p.num_keys) >= + (query_start + p.causal_diagonal_offset)) { + auto query_start = blockIdx.x * kQueriesPerBlock; + auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset( + my_lane_id, my_warp_id, iteratorC_tile_offset); + int32_t last_col; + MM0::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + // last absolute col is (last absolute query + offset) + // last local col is (last absolute query + offset - + // iter_key_start) + last_col = query_start + accum_m + p.causal_diagonal_offset - + iter_key_start; + }, + [&](int accum_m, int accum_n, int idx) { + if (accum_n > last_col) { + accum[idx] = + -cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + } + // Update `mi` from accum stored in registers + // Also does accum[i] <- exp(accum[i] - mi) + iterative_softmax( + accum_o, + accum, + mi, + m_prime, + s_prime, + out_rescale, + shared_storage.addition_storage, + my_lane_id, + thread_id(), + my_warp_id, + p.num_keys - iter_key_start, + iter_key_start == 0, + iteratorC_tile_offset, + kSupportsBias ? 1.0f : p.scale); + + // Output results to shared-memory + int warp_idx_mn_0 = my_warp_id % + (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM, + warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM}; + + MM0::B2bGemm::accumToSmem( + shared_storage.after_mm0.si, accum, my_lane_id, output_tile_coords); + + __syncthreads(); + +#ifdef HAS_PYTORCH + // apply dropout (if applicable) after we've written Pij to smem. + // dropout is applied by multiplying each element of Pij by: + // - 0 with probability dropout_p + // - 1 / (1 - dropout_p) with probability 1 - dropout_p + // + // for backward purposes we want to be able to map each element of the + // attention matrix to the same random uniform number as the one we used + // in forward, without needing to use the same iteration order or having + // to store the dropout matrix. 
its possible to do this in registers but + // it ends up being very slow because each thread having noncontiguous + // strips of the Pij tile means we have to skip around a lot, and also + // have to generate a single random number at a time + if (kSupportsDropout && p.use_dropout) { + auto si = shared_storage.after_mm0.si.accum_ref(); + // each thread handles a contiguous sequence of elements from Sij, all + // coming from the same row. the reason they have to come from the same + // row is that the sampling random numbers from a contiguous random + // number sequence is much more efficient than jumping around, and the + // linear offset of each element of S (the global matrix) maps to an + // offset in a random number sequence. for S, the end of a row and the + // beginning of the next have adjacent offsets, but for Sij, this is not + // necessarily the case. + const int num_threads = blockDim.x * blockDim.y * blockDim.z; + const int threads_per_row = + cutlass::fast_min(num_threads / problem_size_0_m, problem_size_0_n); + const int elts_per_thread = cutlass::round_nearest( + cutlass::ceil_div(problem_size_0_n, threads_per_row), 4); + + const int thread_i = thread_id() / threads_per_row; + const int thread_start_j = + (thread_id() % threads_per_row) * elts_per_thread; + + if (thread_i < problem_size_0_m && thread_start_j < problem_size_0_n) { + curandStatePhilox4_32_10_t curand_state = curand_state_init; + skipahead( + static_cast( + (query_start + thread_i) * p.num_keys_absolute + + (iter_key_start + thread_start_j)), + &curand_state); + const float dropout_scale = 1.0 / (1.0 - p.dropout_prob); + + // apply dropout scaling to elements this thread is responsible for, + // in chunks of 4 + for (int sij_start_col_idx = thread_start_j; sij_start_col_idx < + cutlass::fast_min(thread_start_j + elts_per_thread, + problem_size_0_n); + sij_start_col_idx += 4) { + const float4 rand_uniform_quad = curand_uniform4(&curand_state); + + CUTLASS_PRAGMA_UNROLL + for (int quad_idx = 0; quad_idx < 4; ++quad_idx) { + si.at({thread_i, sij_start_col_idx + quad_idx}) *= + static_cast( + dropout_scale * + ((&rand_uniform_quad.x)[quad_idx] > p.dropout_prob)); + } + } + } + __syncthreads(); // p.use_dropout should have same value kernel-wide + } +#endif + + // + // MATMUL: Attn . V + // Run the matmul `attn @ V` for a block of attn and V. + // `attn` is read from shared memory (in `shared_storage_si`) + // `V` is read from global memory (with iterator_B) + // + + const int64_t nBlockN = kSingleValueIteration + ? 
1 + : ceil_div( + (int64_t)problem_size_1_n, int64_t(MM1::ThreadblockShape::kN)); + for (int blockN = 0; blockN < nBlockN; ++blockN) { + int gemm_k_iterations = + (problem_size_1_k + MM1::Mma::Shape::kK - 1) / MM1::Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add and store it in accum + // (in registers) + if (!kPreloadV) { + __syncthreads(); // we share shmem between mma and epilogue + } + + typename MM1::Mma::IteratorB iterator_V( + typename MM1::IteratorB::Params{typename MM1::LayoutB(p.v_strideM)}, + p.value_ptr + iter_key_start * p.v_strideM, + {problem_size_1_k, problem_size_1_n}, + thread_id(), + cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); + typename MM1::Mma mma_pv( + // operand A: Pij_dropped in shared memory + shared_storage.after_mm0.si.accum_ref(), + // operand B: shared memory staging area for Vj, which is loaded + // from global memory + shared_storage.after_mm0.mm1.operand_B_ref(), + (int)thread_id(), + (int)my_warp_id, + (int)my_lane_id); + mma_pv.set_prologue_done(kPreloadV); + if (!kKeepOutputInRF) { + accum_o.clear(); + } + mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o); + __syncthreads(); + + if (kPreloadV && !kSingleValueIteration && blockN + 1 < nBlockN) { + prologueV(blockN + 1); + } + + if (!kKeepOutputInRF) { + MM1::Mma::drain_cp_asyncs(); + DISPATCH_BOOL( + iter_key_start == 0, kIsFirst, ([&] { + DISPATCH_BOOL( + (iter_key_start + kKeysPerBlock) >= p.num_keys, + kIsLast, + ([&] { + using DefaultEpilogue = typename MM1::DefaultEpilogue; + using DefaultOp = + typename MM1::DefaultConfig::EpilogueOutputOp; + using ElementCompute = typename DefaultOp::ElementCompute; + using EpilogueOutputOp = typename cutlass::epilogue:: + thread::MemoryEfficientAttentionNormalize< + typename cutlass::platform::conditional< + kIsLast::value, + output_t, + output_accum_t>::type, + output_accum_t, + DefaultOp::kCount, + typename DefaultOp::ElementAccumulator, + ElementCompute, + kIsFirst::value, + kIsLast::value, + cutlass::Array>; + using Epilogue = typename cutlass::epilogue::threadblock:: + EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename MM1::Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename cutlass::platform::conditional< + kIsLast::value, + typename MM1::OutputTileIterator, + typename MM1::OutputTileIteratorAccum>::type, + typename DefaultEpilogue:: + AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true, // IterationsUnroll + typename MM1::OutputTileIteratorAccum // Read + // iterator + >; + + int col = blockN * MM1::Mma::Shape::kN; + auto source_iter = createOutputAccumIter(col); + auto dest_iter = call_conditional< + kIsLast::value, + decltype(createOutputIter), + decltype(createOutputAccumIter)>:: + apply(createOutputIter, createOutputAccumIter, col); + EpilogueOutputOp rescale(s_prime, out_rescale); + Epilogue epilogue( + shared_storage.epilogue_shared_storage(), + thread_id(), + my_warp_id, + my_lane_id); + epilogue(rescale, dest_iter, accum_o, source_iter); + })); + })); + if (!kSingleValueIteration) { + __syncthreads(); + } + } + } + __syncthreads(); // we modify `m_prime` after + } + + if (kKeepOutputInRF) { + constexpr bool kIsFirst = true; + constexpr bool kIsLast = true; + using DefaultEpilogue = typename MM1::DefaultEpilogue; + using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp; + using ElementCompute = typename 
DefaultOp::ElementCompute; + using EpilogueOutputOp = + typename cutlass::epilogue::thread::MemoryEfficientAttentionNormalize< + output_t, // output + output_accum_t, // source + DefaultOp::kCount, + typename DefaultOp::ElementAccumulator, // accum + output_accum_t, // compute + kIsFirst, + kIsLast, + cutlass::Array>; + using Epilogue = + typename cutlass::epilogue::threadblock::EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename MM1::Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename MM1::OutputTileIterator, // destination + typename DefaultEpilogue::AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true, // IterationsUnroll + typename MM1::OutputTileIteratorAccum // source tile + >; + auto dest_iter = createOutputIter(0); + EpilogueOutputOp rescale(s_prime, out_rescale); + Epilogue epilogue( + shared_storage.epilogue_shared_storage(), + thread_id(), + warp_id(), + lane_id()); + MM1::Mma::drain_cp_asyncs(); + epilogue(rescale, dest_iter, accum_o); + } + + // 7. Calculate logsumexp + // To make the backward easier, we pad logsumexp with `inf` + // this avoids a few bound checks, and is not more expensive during fwd + static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, ""); + if (p.logsumexp_ptr && thread_id() < kQueriesPerBlock) { + auto lse_dim = ceil_div((int32_t)p.num_queries, kAlignLSE) * kAlignLSE; + constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E + if (thread_id() < p.num_queries) { + p.logsumexp_ptr[thread_id()] = accum_t(mi[thread_id()] / kLog2e) + + cutlass::fast_log(accum_t(s_prime[thread_id()])); + } else if (thread_id() < lse_dim) { + p.logsumexp_ptr[thread_id()] = + cutlass::platform::numeric_limits::infinity(); + } + } + } + + template + CUTLASS_DEVICE static void iterative_softmax( + typename WarpIteratorC::Fragment& frag_o, // output so far + typename WarpIteratorC::Fragment& frag, + cutlass::Array& mi, + cutlass::Array& m_prime, + cutlass::Array& s_prime, + cutlass::Array& out_rescale, + cutlass::Array& + addition_storage, + int8_t lane_id, + int8_t thread_id, + int8_t warp_id, + int max_col, + bool is_first, + typename WarpIteratorC::TensorCoord const& tile_offset, + float scaling) { + /* Iterates on the accumulator and corresponding position on result matrix + + (1) Update `mi[r]` to the max value of the row `r` + (2) In a second iteration do the following: + (a) accum <- exp(accum - mi) + (b) m_prime <- exp(m_prime - mi) + (c) s_prime <- s_prime * m_prime + sum(accum) + + All of this is done on registers, before we store all of this + on shared memory for the next matmul with Value. 
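+
+    The rescaling in (2b)/(2c) relies on the identity
+      exp(x - m_new) = exp(x - m_old) * exp(m_old - m_new),
+    so sums (and, when kKeepOutputInRF, the partial output via `out_rescale`)
+    accumulated under an older row max only need to be multiplied by
+    exp(m_old - m_new) when a larger max is found; nothing already computed has
+    to be re-exponentiated. The code below works in base 2 (exp2f after
+    pre-scaling by log2(e)), which is equivalent and cheaper on GPU.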
+ */ + using Fragment = typename WarpIteratorC::Fragment; + using LambdaIterator = typename DefaultMmaAccumLambdaIterator< + WarpIteratorC, + accum_t, + kWarpSize>::Iterator; + // Convert to `accum_t` (rather than double) + constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E + + static_assert(kQueriesPerBlock % kNumWarpsPerBlock == 0, ""); + static constexpr int kLinesPerWarp = kQueriesPerBlock / kNumWarpsPerBlock; + + frag = cutlass::multiplies()(scaling * kLog2e, frag); + + auto lane_offset = + LambdaIterator::get_lane_offset(lane_id, warp_id, tile_offset); + + // First update `mi` to the max per-row + { + accum_t max; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + max = -cutlass::platform::numeric_limits::infinity(); + }, + [&](int accum_m, int accum_n, int idx) { + if (accum_n < max_col) { + max = cutlass::fast_max(max, frag[idx]); + } + }, + [&](int accum_m) { + // Having 4x atomicMax seems faster than reduce within warp + // first... + atomicMaxFloat(&mi[accum_m], max); + }); + } + + // Make sure we all share the update values for `mi` + __syncthreads(); + + // Doing this `exp` is quite expensive. Let's + // split it across the warps + bool restore_mi_to_minus_inf = false; + if (lane_id < kLinesPerWarp) { + int id = warp_id * kLinesPerWarp + lane_id; + auto m_prime_id = m_prime[id]; + auto mi_id = mi[id]; + bool changed = m_prime_id < mi_id; // `false` if both are -inf + if (changed) { + auto m_prime_exp = exp2f(m_prime_id - mi_id); + out_rescale[id] = m_prime_exp; + s_prime[id] *= m_prime_exp; + } else { + // Only when bias is enabled, it's possible that all the first values + // of attention are masked to `-inf`. In that case we want to avoid + // `nan = exp2f(-inf - (-inf))` so we temporarily set `mi` to 0 + if (kSupportsBias && + mi_id == -cutlass::platform::numeric_limits::infinity()) { + restore_mi_to_minus_inf = true; + mi[id] = 0.0f; + } + out_rescale[id] = 1.0f; + } + } + __syncthreads(); // Update output fragments + if (kKeepOutputInRF && !is_first) { + accum_t line_rescale; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { line_rescale = out_rescale[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + frag_o[idx] = frag_o[idx] * line_rescale; + }, + [&](int accum_m) {}); + } + // Update accum_m, accum_n, ... + { + accum_t mi_row, total_row; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { mi_row = mi[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + frag[idx] = + (accum_n < max_col) ? 
exp2f(frag[idx] - mi_row) : accum_t(0.0); + }, + [&](int accum_m) {}); + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { total_row = 0.0; }, + [&](int accum_m, int accum_n, int idx) { total_row += frag[idx]; }, + [&](int accum_m) { + if (LambdaIterator::reduceSameRow( + lane_id, total_row, [](accum_t a, accum_t b) { + return a + b; + })) { + // NOTE: we could atomically add `total_row` to `s_prime`, but + // it's faster (and deterministic) to avoid atomics here + addition_storage + [accum_m + kQueriesPerBlock * tile_offset.column()] = + total_row; + } + }); + } + __syncthreads(); + if (lane_id < kLinesPerWarp) { + int id = warp_id * kLinesPerWarp + lane_id; + accum_t total_row = s_prime[id]; + if (restore_mi_to_minus_inf) { + // Restore `mi`, see above when we set `restore_mi_to_minus_inf=true` + mi[id] = -cutlass::platform::numeric_limits::infinity(); + } else { + m_prime[id] = mi[id]; + } + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) { + total_row += addition_storage[id + kQueriesPerBlock * i]; + } + s_prime[id] = total_row; + } + } + + static CUTLASS_DEVICE int8_t lane_id() { + return threadIdx.x; + } + static CUTLASS_DEVICE int8_t warp_id() { + return threadIdx.y; + } + static CUTLASS_DEVICE int16_t thread_id() { + return threadIdx.x + threadIdx.y * blockDim.x; + } +}; + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_batched_impl(typename AK::Params p) { + if (!p.advance_to_block()) { + return; + } + AK::attention_kernel(p); +} + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_batched(typename AK::Params params); diff --git a/examples/41_fused_multi_head_attention/piped_subprocess.py b/examples/41_fused_multi_head_attention/piped_subprocess.py new file mode 100644 index 0000000000..82351f492c --- /dev/null +++ b/examples/41_fused_multi_head_attention/piped_subprocess.py @@ -0,0 +1,144 @@ +################################################################################################# +# +# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################################# + +from typing import List +import torch +import subprocess +import sys +import tempfile +import os +import numpy as np + + +TORCH_DTYPE_NAME = { + torch.float32: "f32", + torch.float16: "f16", + torch.bfloat16: "b16" +} +NAME_TORCH_DTYPE = {v: k for k, v in TORCH_DTYPE_NAME.items()} + +def _tensor_from_storage(tensor: torch.Tensor, dtype) -> torch.Tensor: + # PyTorch >= 2.0 + if hasattr(tensor, 'untyped_storage'): + return torch.tensor([], dtype=dtype).set_(tensor.untyped_storage()) + return torch.tensor([], dtype=dtype).set_(tensor.storage().untyped()) + +class PipedSubprocess: + def __init__(self, binary: str) -> None: + self.binary = binary + self.tempdir_ctx = tempfile.TemporaryDirectory() + + def __enter__(self) -> "PipedSubprocess": + self.subp = subprocess.Popen(self.binary, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=sys.stderr, text=True, bufsize=0) + self.tempdir = self.tempdir_ctx.__enter__() + self.file_counter = 0 + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + self.tempdir_ctx.__exit__(exc_type, exc_val, exc_tb) + + def temp_filename(self, suffix: str) -> str: + self.file_counter += 1 + return os.path.join(self.tempdir, f"{self.file_counter}{suffix}") + + def write(self, *args) -> None: + for a in args: + self.subp.stdin.write(str(a) + " ") + + def writeTensor(self, tensor: torch.Tensor, name: str, stride_names: List[str]) -> None: + print(f"Py ->C++: {TORCH_DTYPE_NAME[tensor.dtype]}:{name}") + tensor_u8 = _tensor_from_storage(tensor, torch.uint8) + self.write("tensor_begin", f"{TORCH_DTYPE_NAME[tensor.dtype]}:{name}", tensor_u8.shape[0]) + filename = self.temp_filename(f"{name}.tensor") + assert tensor.storage_offset() == 0 + with open(filename, "wb+") as fd: + fd.write(bytes(tensor_u8.numpy())) + self.write("file", filename) + self.write("tensor_end") + + for stride_name, stride_value in zip(stride_names, tensor.stride()): + self.write(stride_name, stride_value) + + def readTensor(self, name, stride_name, shape) -> torch.Tensor: + tmpfile = self.temp_filename(f"{name}.tensor") + self.write("tmpfile", tmpfile) + + self.readExpect("tensor_begin") + dtype_str, name = self.read().split(":") + print(f"C++->Py : {dtype_str}:{name}") + u8len = int(self.read()) + dtype = NAME_TORCH_DTYPE[dtype_str] + + self.readExpect("file") + self.readExpect(tmpfile) + + with open(tmpfile, "rb") as fd: + data = fd.read(u8len) + # `np.array` is not strictly needed, but avoids a torch warning + tensor_u8 = torch.frombuffer(np.array(data), dtype=torch.uint8, count=u8len) + self.readExpect("tensor_end") + + tensor = _tensor_from_storage(tensor_u8, dtype) + strides = [] + for sn in stride_name: + self.readExpect(sn) + strides.append(int(self.read())) + if len(strides) != shape: + strides.append(1) + assert len(strides) == len(shape), name + return torch.as_strided(tensor, shape, strides) + + def readNamed(self, name: str): + 
self.readExpect(name) + return self.read() + + def readExpect(self, what: str) -> None: + r = self.read() + if r != what: + raise ValueError(f"Read {r} but expected {what}") + + def read(self): + read_all = [] + # Skip initial whitespace + while True: + r = self.subp.stdout.read(1) + if r not in [' ', "\n"]: + read_all.append(r) + break + # Read data + while True: + r = self.subp.stdout.read(1) + if r in [' ', "\n"]: + break + read_all.append(r) + return ''.join(read_all) + diff --git a/examples/41_fused_multi_head_attention/transform/tile_smem_loader.h b/examples/41_fused_multi_head_attention/transform/tile_smem_loader.h new file mode 100644 index 0000000000..2db928a84c --- /dev/null +++ b/examples/41_fused_multi_head_attention/transform/tile_smem_loader.h @@ -0,0 +1,90 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" +#include "cutlass/transform/threadblock/regular_tile_iterator.h" + +template < + typename scalar_t, // scalar type + typename ThreadblockTileShape, // size of tile to load + int Threads, // number of participating threads + int ElementsPerAccess> // thread access width in elements +class TileSmemLoader { + public: + using SmemTile = + cutlass::AlignedBuffer; + + using ThreadMap = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape< + ThreadblockTileShape::kColumn, // contiguous + ThreadblockTileShape::kRow>, // strided + Threads, // Threads + ElementsPerAccess>; // ElementsPerAccess + + using GmemTileIterator = + cutlass::transform::threadblock::PredicatedTileIterator< + ThreadblockTileShape, // Shape + scalar_t, // Element + cutlass::layout::RowMajor, // Layout + 0, // AdvanceRank + ThreadMap>; // ThreadMap + + using SmemTileIterator = cutlass::transform::threadblock::RegularTileIterator< + ThreadblockTileShape, // Shape + scalar_t, // Element + cutlass::layout::RowMajor, // Layout + 0, // AdvanceRank + ThreadMap>; // ThreadMap + + using Fragment = typename GmemTileIterator::Fragment; + + /// load a tile from global memory into shared memory + CUTLASS_DEVICE + static void load( + GmemTileIterator tile_load_iter, + SmemTileIterator tile_store_iter) { + Fragment tb_frag; + tb_frag.clear(); + tile_load_iter.load(tb_frag); + tile_store_iter.store(tb_frag); + + __syncthreads(); + } +}; diff --git a/examples/42_ampere_tensorop_group_conv/CMakeLists.txt b/examples/42_ampere_tensorop_group_conv/CMakeLists.txt new file mode 100644 index 0000000000..d470548cdc --- /dev/null +++ b/examples/42_ampere_tensorop_group_conv/CMakeLists.txt @@ -0,0 +1,36 @@ + +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +cutlass_example_add_executable( + 42_ampere_tensorop_group_conv + ampere_tensorop_group_conv.cu + ) + diff --git a/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu b/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu new file mode 100644 index 0000000000..120f04b649 --- /dev/null +++ b/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu @@ -0,0 +1,706 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** +This example shows how to run group convolution kernels using functions and data structures +provided by CUTLASS using tensor cores; which we run on a NVIDIA Ampere GPU. + +There are 2 group conv mode: + 1. cutlass::conv::GroupMode::kSingleGroup + This mode is for large K problem size: k_per_group (K/groups) equals or larger than + threadblock_tile_N. One or multiple threadblocks calculate data of one group. + 2. cutlass::conv::GroupMode::kMultipleGroup + This mode is for small K problem size: k_per_group (K/groups) is smaller than threadblock_tile_N. + One threadblock will calculate data from more than one group. 
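+
+For example, with the ThreadblockShape used in this file (64x64x64, so
+threadblock_tile_N = 64) and K = 128: groups = 2 gives k_per_group = 64 >= 64,
+so kSingleGroup applies, while groups = 8 gives k_per_group = 16 < 64, so
+kMultipleGroup is required.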
+ +Function profile_convolution_selecter() shows how to choose kernel with different group mode according +to problem size and threadblock_tile size. +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/conv/kernel/default_conv2d_group_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/device/convolution.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +// The code section below describes datatype for input, output tensors and computation between +// elements +using ElementAccumulator = float; // Data type of accumulator +using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) +using ElementInputA = cutlass::half_t; // Data type of elements in input tensor +using ElementInputB = cutlass::half_t; // Data type of elements in input tensor +using ElementOutput = float; // Data type of elements in output tensor + +using LayoutInputA = cutlass::layout::TensorNHWC; +using LayoutInputB = cutlass::layout::TensorNHWC; +using LayoutOutput = cutlass::layout::TensorNHWC; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm80; + +// This code section describes the tile size a thread block will compute +using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>; // Threadblock tile shape + +// This code section describes tile size a warp will compute +using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; // Warp tile shape + +// This code section describes the size of MMA op +using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; // TensorCore instruction shape + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + +// Number of pipelines you want to use +constexpr int NumStages = 3; + +// This code section describes the epilogue part of the kernel, we use default value +using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // Data type of output matrix. + 128 / cutlass::sizeof_bits::value, // The number of elements per vectorized. + // memory access. This becomes the vector width of + // math instructions in the epilogue too. 
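+                                                        // (For ElementOutput = float, sizeof_bits is 32,
+                                                        // so this evaluates to 128 / 32 = 4 elements per access.)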
+ ElementAccumulator, // Data type of accumulator + ElementComputeEpilogue>; // Data type for alpha/beta in linear combination + +// Analytic kernel and operation for single group problem size +using AnalyticSingleGroupKernel = typename cutlass::conv::kernel::DefaultConv2dGroupFprop< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::GroupMode::kSingleGroup, + cutlass::conv::IteratorAlgorithm::kAnalytic +>::Kernel; +using AnalyticSingleGroupOperation = cutlass::conv::device::ImplicitGemmConvolution; + +// Analytic kernel and operation for multiple group problem size +using AnalyticMultipleGroupKernel = typename cutlass::conv::kernel::DefaultConv2dGroupFprop< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::GroupMode::kMultipleGroup, + cutlass::conv::IteratorAlgorithm::kAnalytic +>::Kernel; +using AnalyticMultipleGroupOperation = cutlass::conv::device::ImplicitGemmConvolution; + +// Optimized kernel and operation for single group problem size +using OptimizedSingleGroupKernel = typename cutlass::conv::kernel::DefaultConv2dGroupFprop< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::GroupMode::kSingleGroup, + cutlass::conv::IteratorAlgorithm::kOptimized +>::Kernel; +using OptimizedSingleGroupOperation = cutlass::conv::device::ImplicitGemmConvolution; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + int groups; + bool reference_check; + bool measure_performance; + int iterations; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool optimized; + std::string tag; + + Options(): + help(false), + input_size(1, 32, 32, 32), + filter_size(32, 3, 3, 32), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + groups(1), + reference_check(false), + measure_performance(false), + iterations(20), + alpha(1), + beta(0), + optimized(false) { } + + // Verify the problem size is compatible with the CUTLASS Convolution implementation. + bool valid() { + + // + // CUTLASS attempts to load 128b vectors of cutlass::half_t (F16) elements. Consequently, + // all pointers, strides, and tensor extents must be divisible by 8 elements. 
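+    // (128 bits / 16 bits per cutlass::half_t = 8 elements, hence kAlignment = 8.)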
+ // + int const kAlignment = 8; + + if ((input_size.c() % kAlignment) || + (filter_size.n() % kAlignment)) { + + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || + (padding.w() != filter_size.w() / 2)) { + + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update( + cutlass::Tensor4DCoord input_size, + cutlass::Tensor4DCoord filter_size) { + + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + if (cmd.check_cmd_line_flag("ref-check")) { + reference_check = true; + } + + if (cmd.check_cmd_line_flag("perf-check")) { + measure_performance = true; + } + + if (cmd.check_cmd_line_flag("optimized")) { + optimized = true; + } + + cmd.get_cmd_line_argument("n", input_size.n()); + cmd.get_cmd_line_argument("h", input_size.h()); + cmd.get_cmd_line_argument("w", input_size.w()); + cmd.get_cmd_line_argument("c", input_size.c()); + + cmd.get_cmd_line_argument("k", filter_size.n()); + cmd.get_cmd_line_argument("r", filter_size.h()); + cmd.get_cmd_line_argument("s", filter_size.w()); + + cmd.get_cmd_line_argument("g", groups); + filter_size.c() = input_size.c() / groups; + + cmd.get_cmd_line_argument("u", conv_stride.row()); + cmd.get_cmd_line_argument("v", conv_stride.column()); + + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("tag", tag); + + if (filter_size.h() == 3 && filter_size.w() == 3) { + padding = {1, 1, 1, 1}; + } + else { + filter_size.h() = 1; + filter_size.w() = 1; + padding = {0, 0, 0, 0}; + } + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "42_ampere_tensorop_group_conv example\n\n" + << " This example uses Ampere's Tensor Core operators on F16 data types to compute\n" + << " forward grouped convolution on tensors of layout NHWC.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --n= Input tensor extent N\n" + << " --h= Input tensor extent H\n" + << " --w= Input tensor extent W\n" + << " --c= Input tensor extent C\n" + << " --k= Filter extent K\n" + << " --r= Filter extent R\n" + << " --s= Filter extent S\n\n" + << " --g= Conv groups G\n\n" + << " --u= Conv stride_h\n\n" + << " --v= Conv stride_w\n\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --ref-check If set (true), reference check is computed\n" + << " --perf-check If set (true), performance is measured.\n" + << " --optimized If set (true), use optimized kernel, otherwise use analytic kernel.\n" + << " --iterations= Number of profiling iterations to perform.\n" + << " --tag= String to replicate across the first column in the results table\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/42_ampere_tensorop_group_conv/42_ampere_tensorop_group_conv --n=4 --h=16 --w=16 --c=256 --k=128 --r=3 --s=3 --g=8 --ref-check\n\n" + << "$ ./examples/42_ampere_tensorop_group_conv/42_ampere_tensorop_group_conv --n=4 --h=16 --w=16 --c=256 --k=128 --r=3 --s=3 --g=2 --ref-check\n\n" + << "$ ./examples/42_ampere_tensorop_group_conv/42_ampere_tensorop_group_conv --n=4 --h=16 --w=16 --c=256 --k=128 --r=3 --s=3 --g=2 --ref-check --optimized\n\n"; + + return out; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of multiply-adds = NPQK * CRS + int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct Result { + double runtime_ms; + double gflops; + cutlass::Status status; + cutlass::Status reference_check; + cudaError_t error; + + Result(): + runtime_ms(0), + gflops(0), + status(cutlass::Status::kSuccess), + reference_check(cutlass::Status::kInvalid), + error(cudaSuccess) { } + + static std::ostream & print_header(std::ostream &out, Options const &options) { + + if (!options.tag.empty()) { + out << "Name,"; + } + + out << "Layer,N,H,W,C,K,R,S,G,Runtime,GFLOPs"; + + return out; + } + + std::ostream & print(std::ostream &out, int idx, Options const &options) { + + if (!options.tag.empty()) { + out << options.tag << ","; + } + + out + << "conv_" << idx << "," + << options.input_size.n() << "," + << options.input_size.h() << "," + << options.input_size.w() << "," + << options.input_size.c() << "," + << options.filter_size.n() << "," + << options.filter_size.h() << "," + << options.filter_size.w() << "," + << options.groups << "," + << runtime_ms << "," + << gflops; + + return out; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// 
Runs one benchmark
+template <typename Conv2dOperation>
+Result profile_convolution(Options const &options) {
+
+ Result result;
+
+ //
+ // Allocate host-device tensors using the CUTLASS Utilities.
+ //
+
+ cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
+ cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
+ cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size());
+ cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(options.output_size());
+ cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(options.output_size());
+
+ //
+ // Initialize tensors
+ //
+
+ // Fill tensor A on host with uniform-distribution random data
+ cutlass::reference::host::TensorFillRandomUniform(
+ tensor_a.host_view(),
+ 1,
+ ElementInputA(7),
+ ElementInputA(-8),
+ 0);
+
+ // Fill tensor B on host with uniform-distribution random data
+ cutlass::reference::host::TensorFillRandomUniform(
+ tensor_b.host_view(),
+ 1,
+ ElementInputB(7),
+ ElementInputB(-8),
+ 0);
+
+ // Fill tensor C on host with uniform-distribution random data
+ cutlass::reference::host::TensorFillRandomUniform(
+ tensor_c.host_view(),
+ 1,
+ ElementOutput(7),
+ ElementOutput(-8),
+ 0);
+
+ // Fill tensor D on host with zeros
+ cutlass::reference::host::TensorFill(
+ tensor_d.host_view());
+
+ // Fill tensor D for reference on host with zeros
+ cutlass::reference::host::TensorFill(
+ tensor_ref_d.host_view());
+
+ // Copy data from host to GPU
+ tensor_a.sync_device();
+ tensor_b.sync_device();
+ tensor_c.sync_device();
+ tensor_d.sync_device();
+ tensor_ref_d.sync_device();
+
+ //
+ // Define arguments for CUTLASS Convolution
+ //
+
+ cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;
+
+ // Split K dimension into 1 partition
+ int split_k_slices = 1;
+
+ // Construct Conv2dProblemSize with user defined output size
+ cutlass::conv::Conv2dProblemSize problem_size(
+ options.input_size,
+ options.filter_size,
+ options.padding,
+ options.conv_stride,
+ options.dilation,
+ options.output_size(),
+ mode,
+ split_k_slices,
+ options.groups
+ );
+
+ // Construct Conv2dOperation::Argument structure with conv2d
+ // problem size, data pointers, and epilogue values
+ typename Conv2dOperation::Arguments arguments{
+ problem_size,
+ tensor_a.device_ref(),
+ tensor_b.device_ref(),
+ tensor_c.device_ref(),
+ tensor_d.device_ref(),
+ {options.alpha, options.beta},
+ };
+
+ //
+ // Initialize CUTLASS Convolution
+ //
+
+ Conv2dOperation implicit_gemm_op;
+
+ size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
+
+ // Allocate workspace memory
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+ result.status = implicit_gemm_op.can_implement(arguments);
+ CUTLASS_CHECK(result.status);
+
+ result.status = implicit_gemm_op.initialize(arguments, workspace.get());
+ CUTLASS_CHECK(result.status);
+
+ //
+ // Launch initialized CUTLASS kernel
+ //
+ result.status = implicit_gemm_op();
+
+ CUTLASS_CHECK(result.status);
+
+ //
+ // Optional reference check
+ //
+
+ if (options.reference_check) {
+ std::cout << "Verification on device...\n";
+
+ // Compute with reference implementation
+ cutlass::reference::device::Conv2dFprop<
+ ElementInputA,
+ LayoutInputA,
+ ElementInputB,
+ LayoutInputB,
+ ElementOutput,
+ LayoutOutput,
+ ElementComputeEpilogue,
+ ElementAccumulator,
+ cutlass::NumericConverter<ElementOutput, ElementComputeEpilogue>
+ >(
+ problem_size,
+ tensor_a.device_ref(),
+ tensor_b.device_ref(),
+ tensor_c.device_ref(),
+ tensor_ref_d.device_ref(),
+ options.alpha,
+ options.beta
+ );
+
+ tensor_ref_d.sync_host();
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+
tensor_d.sync_host(); + + bool passed = cutlass::reference::host::TensorEquals( + tensor_d.host_view(), + tensor_ref_d.host_view()); + + if (!passed) { + result.reference_check = cutlass::Status::kErrorInternal; + std::cout << "ERROR - results miscompared.\n"; + } else { + result.reference_check = cutlass::Status::kSuccess; + std::cout << "Passed.\n"; + } + } else { + result.reference_check = cutlass::Status::kInvalid; + } + + // + // Performance measurement + // + + if (options.measure_performance) { + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + } + + // Record an event at the start of a series of convolution operations. + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Launch a sequence of implicit GEMM operations on the device + for (int iteration = 0; iteration < options.iterations; ++iteration) { + result.status = implicit_gemm_op(); + CUTLASS_CHECK(result.status); + } + + // Record an event when the convolutions have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Print average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(options.iterations);
+ result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+ // Cleanup
+ for (auto event : events) {
+ (void)cudaEventDestroy(event);
+ }
+ }
+
+ return result;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+Result profile_convolution_selecter(Options const &options) {
+ int k_per_group = options.filter_size.n() / options.groups;
+
+ // In group conv, if k_per_group < threadblock_N, one Threadblock will calculate multiple groups
+ if (k_per_group < ThreadblockShape::kN) { // MultipleGroup mode
+ if (options.optimized) {
+ std::cerr << "Invalid problem: the optimized group conv kernel doesn't support MultipleGroup mode (one CTA calculates multiple groups)" << std::endl;
+ exit(-1);
+ } else {
+ std::cout << "Select AnalyticMultipleGroupOperation\n";
+ return profile_convolution<AnalyticMultipleGroupOperation>(options);
+ }
+ } else { // SingleGroup mode
+ if (options.optimized) {
+ std::cout << "Select OptimizedSingleGroupOperation\n";
+ return profile_convolution<OptimizedSingleGroupOperation>(options);
+ } else {
+ std::cout << "Select AnalyticSingleGroupOperation\n";
+ return profile_convolution<AnalyticSingleGroupOperation>(options);
+ }
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+ bool notSupported = false;
+
+ // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
+ //
+ // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
+ if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
+ std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
+ notSupported = true;
+ }
+
+ cudaDeviceProp props;
+ CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
+
+ if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) {
+ std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80."
+ << std::endl;
+ notSupported = true;
+ }
+
+ if (notSupported) {
+ return 0;
+ }
+
+ Options options;
+
+ options.parse(argc, args);
+
+ if (options.help) {
+ options.print_usage(std::cout) << std::endl;
+ return 0;
+ }
+
+ // Execute one problem size
+ if (!options.valid()) {
+ std::cerr << "Invalid problem." << std::endl;
+ return -1;
+ }
+
+ Result result = profile_convolution_selecter(options);
+
+ Result::print_header(std::cout, options) << std::endl;
+ result.print(std::cout, 1, options) << std::endl;
+
+ return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/43_ell_block_sparse_gemm/CMakeLists.txt b/examples/43_ell_block_sparse_gemm/CMakeLists.txt
new file mode 100644
index 0000000000..0676c7bd31
--- /dev/null
+++ b/examples/43_ell_block_sparse_gemm/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_example_add_executable( + 43_ell_block_sparse_gemm + ell_block_sparse_gemm.cu + ) + diff --git a/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu b/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu new file mode 100644 index 0000000000..52d2d0cbfa --- /dev/null +++ b/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu @@ -0,0 +1,740 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Block-Ell sparse gemm example. + + This example performs a Sparse-matrix dense-matrix multiplication (SpMM) operation. + Matrix A is stored in the Blocked-Ellpack (Blocked-ELL) storage format. 
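+ As a small illustrative example (not taken from this source): an 8x8 sparse matrix with
+ a_ell_blocksize = 2 and at most two non-zero 2x2 blocks per block-row would be stored as an
+ 8x4 ellValue matrix (each block-row keeps its stored blocks side by side) plus a 4x2 ellColInd
+ matrix holding the block-column index of every stored block, with -1 marking an empty block;
+ these terms are defined in the parameter list below.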
+ Details about the Blocked-Ellpack (Blocked-ELL) storage format can be found here: + https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-spmat-create-blockedell + Whereas matrix B is a dense matrix. + + Blocked-Ellpack or Blocked-ELL storage format comprises of two matrices. + First is a packed matrix (ellValue matrix) that stores non-zero values in consecutive blocks, + represented by tensor_a in this example. Second is a matrix of indices (ellColInd matrix), + represented by tensor_ell_idx in this example, that represent the column indices of the + corresponding non-zero blocks. All rows in the matrices must have the same number of blocks. + ellColInd can contain -1 values for indicating empty blocks. These matrices store elements in + row-major order. + + Description of parameters and tensors used to represent the Blocked-Ellpack (ELL) format + for this example: + a_rows - Rows in the sparse matrix. + a_cols - Colums in the sparse matrix. + a_ell_blocksize - Size of the ELL-Blocks. + a_ell_num_columns - Number of columns in the Blocked-Ellpack format (ellValue columns) + tensor_a - ellValue matrix, whose size is (a_rows * a_ell_num_columns) + tensor_ell_idx - Blocked-ELL Column indices (ellColInd), whose size is + (a_rows / a_ell_blocksize) * (a_ell_num_columns / a_ell_blocksize) + tensor_b - Input dense matrix whose size is (a_cols * n) + tensor_c/tensor_d - Output dense matrix whose size is (a_rows * n) + {a_rows, n, a_cols} - Problem size + +*/ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/kernel/gemm_grouped.h" +#include "cutlass/gemm/kernel/default_gemm_grouped.h" +#include "cutlass/gemm/device/ell_gemm.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/host_uncompress.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool reference_check; + int iterations; + int cuda_streams; + int a_rows, n, a_cols; + int a_ell_num_columns; + int a_ell_blocksize; + int a_base; + float alpha; + float beta; + + // + // Methods + // + + Options(): + help(false), + reference_check(true), + iterations(20), + cuda_streams(0), + a_rows(1024), + n(1024), + a_cols(1024), + a_ell_num_columns(512), + a_ell_blocksize(16), + a_base(0), + alpha(1), + beta() + { } + + // Parses the command line + void parse(int argc, char const **args) { + 
cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("alpha", alpha, 1.0f); + cmd.get_cmd_line_argument("beta", beta, 0.0f); + cmd.get_cmd_line_argument("iterations", iterations, 20); + cmd.get_cmd_line_argument("streams", cuda_streams, 0); + cmd.get_cmd_line_argument("reference-check", reference_check, true); + + cmd.get_cmd_line_argument("a_rows", a_rows, 1024); + cmd.get_cmd_line_argument("n", n, 1024); + cmd.get_cmd_line_argument("a_cols", a_cols, 1024); + + cmd.get_cmd_line_argument("a_ell_num_columns", a_ell_num_columns, 512); + cmd.get_cmd_line_argument("a_ell_blocksize", a_ell_blocksize, 16); + cmd.get_cmd_line_argument("a_base", a_base, 0); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "43_ell_block_sparse_gemm\n\n" + << " This example profiles the performance of a ELL block sparse GEMM kernel.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --a_rows= Sets the number of the rows of the sparse matrix.\n" + << " --n= Sets the N dimension.\n" + << " --a_cols= Sets the number of columns of the sparse matrix.\n" + << " --a_ell_num_columns= Sets the actual number of columns of the Blocked-Ellpack format.\n" + << " --a_ell_blocksize= Sets the size of the ELL-Block.\n" + << " --a_base= Sets the base index.\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --beta= Epilogue scalar beta (real part)\n\n" + << " --iterations= Number of profiling iterations to perform.\n" + << " --reference-check= If true, performs reference check.\n"; + + out << "\n\nExamples:\n\n" + + << "# Runs a 1024x1024x1024 ELL block sparse GEMM with 16x16 block size and actual 512 non-zero columns in A operand\n" + << "$ ./examples/43_ell_block_sparse_gemm/43_ell_block_sparse_gemm --a_rows=1024 --n=1024 --a_cols=1024 --a_ell_num_columns=512 --a_ell_blocksize=16\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = (int64_t)a_rows * (int64_t)a_cols * (int64_t)n; + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class Testbed { +public: + + // + // Type definitions + // + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementC = typename Gemm::ElementC; + using ElementAccumulator = typename Gemm::ElementAccumulator; + + using EpilogueOutputOp = typename Gemm::GemmKernel::Epilogue::OutputOp; + using ElementCompute = typename EpilogueOutputOp::ElementCompute; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + + using MatrixCoord = typename LayoutC::TensorCoord; + +private: + + // + // Data members + // + + Options options; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + cutlass::Distribution::Kind init_ELL; + uint32_t seed; + + cutlass::HostTensor tensor_a; + cutlass::HostTensor tensor_b; + cutlass::HostTensor tensor_c; + cutlass::HostTensor tensor_d; + + cutlass::HostTensor tensor_a_uncompressed; + cutlass::HostTensor reference_d; + + cutlass::HostTensor tensor_ell_idx; + +public: + + // + // Methods + // + + Testbed( + Options const 
&options_, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_ELL_ = cutlass::Distribution::Uniform, + uint32_t seed_ = 3080 + ): + options(options_), init_A(init_A_), init_B(init_B_), init_C(init_C_), init_ELL(init_ELL_), seed(seed_) { } + +private: + + /// Helper to initialize a tensor view + template + void initialize_tensor_( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint32_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else if (bits_output == 16) { + if (cutlass::sizeof_bits::value <= 16) { + scope_max = 5; + scope_min = -5; + } + else { + scope_max = 8; + scope_min = -8; + } + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope_max, scope_min, 0); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian( + view, seed, Element(), Element(0.5f)); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + // Fill with increasing elements + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity(), Element(1), Element()); + } else { + + // Fill with all 1s + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity(), Element(), Element(1)); + } + } + + /// Initializes data structures + void initialize_() { + tensor_a.resize(cutlass::make_Coord(options.a_rows, options.a_ell_num_columns)); + tensor_b.resize(cutlass::make_Coord(options.a_cols, options.n)); + tensor_c.resize(cutlass::make_Coord(options.a_rows, options.n)); + tensor_d.resize(cutlass::make_Coord(options.a_rows, options.n)); + + tensor_a_uncompressed.resize(cutlass::make_Coord(options.a_rows, options.a_cols)); + reference_d.resize(cutlass::make_Coord(options.a_rows, options.n)); + + tensor_ell_idx.resize(cutlass::make_Coord(options.a_rows / options.a_ell_blocksize, + options.a_ell_num_columns / options.a_ell_blocksize)); + + // + // Initialize the problems of the workspace + // + + initialize_tensor_(tensor_a.host_view(), init_A, seed * 2021); + initialize_tensor_(tensor_b.host_view(), init_B, seed * 2022); + initialize_tensor_(tensor_c.host_view(), init_C, seed * 2023); + + if (init_ELL == cutlass::Distribution::Uniform) { + cutlass::reference::host::TensorFillRandomEllIdx( + tensor_ell_idx.host_view(), seed, + options.a_rows / options.a_ell_blocksize, + options.a_ell_num_columns / options.a_ell_blocksize, + options.a_cols / options.a_ell_blocksize); + + } else { + for(int i = 0; i < options.a_rows / options.a_ell_blocksize; ++i) { + for(int j = 0; j < options.a_ell_num_columns / options.a_ell_blocksize; ++j) { + tensor_ell_idx.at({i, j}) = j+3; + } + } + } + + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c.sync_device(); + tensor_d.sync_device(); + tensor_ell_idx.sync_device(); + } + + /// Verifies the result is a GEMM + bool verify_() { + + bool passed = true; + + tensor_d.sync_host(); + + cutlass::uncompress_ell_block_sparse( + tensor_a_uncompressed.host_ref(), + tensor_a.host_ref(), + tensor_ell_idx.host_ref(), + options.a_rows, + 
options.a_cols, + options.a_ell_num_columns, + options.a_ell_blocksize + ); + + cutlass::reference::host::Gemm< + typename Gemm::ElementA, typename Gemm::LayoutA, + typename Gemm::ElementB, typename Gemm::LayoutB, + typename Gemm::ElementC, typename Gemm::LayoutC, + ElementCompute, + ElementAccumulator, typename Gemm::Operator> + reference_gemm; + + reference_gemm( + {options.a_rows, options.n, options.a_cols}, + options.alpha, + tensor_a_uncompressed.host_ref(), + tensor_b.host_ref(), + options.beta, + reference_d.host_ref(), + ElementAccumulator(0) + ); + + // Reference check + passed = cutlass::reference::host::TensorEquals(tensor_d.host_view(), reference_d.host_view()); + + if (!passed) { + std::cerr << "\n***\nError - problem failed the QA check\n***\n" << std::endl; + + std::stringstream fname; + + fname << "error_43_ell_block_sparse_gemm" + << "mnk_" + << options.a_rows << "x" + << options.n << "x" + << options.a_cols << "_" + << options.a_ell_num_columns << "_" + << options.a_ell_blocksize << ".txt"; + + std::cout << fname.str() << std::endl; + + std::ofstream results(fname.str()); + + results + << "alpha: " << ElementCompute(options.alpha) << "\n" + << "beta: " << ElementCompute(options.beta) << "\n" + << "block size: " << options.a_ell_blocksize << "\n" + << "\nA:\n" << tensor_a.host_view() << "\n" + << "\nA Ell Index:\n" << tensor_ell_idx.host_view() << "\n" + << "\nB:\n" << tensor_b.host_view() << "\n" + << "\nC:\n" << tensor_c.host_view() << "\n" + << "\nD reference:\n" << reference_d.host_view() << "\n" + << "\nD computed:\n" << tensor_d.host_view() << "\n"; + + + return passed; + } + + return passed; + } + +public: + + /// Returns the number of threadblocks to launch if the kernel can run on the target + /// device. Otherwise, returns zero. + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + size_t smem_size = sizeof(typename Gemm::GemmKernel::SharedStorage); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerBlockOptin < smem_size) { + return false; + } + + return true; + } + + /// Executes a BlockedEll SpMM kernel and measures runtime. + Result profile() { + + Result result; + + // Early exit + if (!sufficient()) { + std::cout << "Active CUDA device lacks hardware resources to run CUTLASS BlockedEll SpMM kernel." << std::endl; + return result; + } + + result.passed = false; + + // Initialize the problem + initialize_(); + + // Configure the GEMM arguments + typename EpilogueOutputOp::Params epilogue_op(options.alpha, options.beta); + + // Configure GEMM arguments + typename Gemm::Arguments args( + {options.a_rows, options.n, options.a_cols}, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_c.device_ref(), + tensor_d.device_ref(), + tensor_ell_idx.device_data(), + options.a_ell_num_columns, + options.a_ell_blocksize, + options.a_base, + epilogue_op + ); + + // Initialize the GEMM object + Gemm gemm{}; + + result.status = gemm.initialize(args); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to initialize CUTLASS BlockedEll SpMM kernel." 
<< std::endl; + return result; + } + + // Run the BlockedEll SpMM object + result.status = gemm.run(); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to run CUTLASS BlockedEll SpMM kernel." << std::endl; + return result; + } + + // Wait for completion + result.error = cudaDeviceSynchronize(); + + if (result.error != cudaSuccess) { + std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error); + return result; + } + + // + // Verify correctness + // + result.passed = true; + + if (options.reference_check) { + result.passed = verify_(); + } + + // + // Warm-up run + // + result.status = gemm.run(); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to run CUTLASS BlockedEll SpMM kernel." << std::endl; + return result; + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < options.iterations; ++iter) { + gemm(); + } + + // + // Stop profiling loop + // + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // + // Cleanup + // + + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + std::cout << std::endl; + std::cout << "ELL Block Sparse GEMM (CUTLASS):\n" + << "====================================================" << std::endl; + + std::cout << std::endl; + std::cout << " " << "Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " " << " GFLOPs: " << result.gflops << std::endl; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // + // This example uses mma.sync to directly access Tensor Cores to achieve peak performance. + // + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (__CUDACC_VER_MAJOR__ < 11 || props.major < 8) { + + // + // This example requires an NVIDIA Ampere-architecture GPU. 
+ // + + std::cout + << "CUTLASS's BlockedEll SpMM example requires a GPU of NVIDIA's Ampere Architecture or " + << "later (compute capability 80 or greater).\n"; + + return 0; + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // + // Define the BlockedEll type + // + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + + constexpr int32_t kAlignmentA = 128 / cutlass::sizeof_bits::value; + constexpr int32_t kAlignmentB = 128 / cutlass::sizeof_bits::value; + + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + constexpr int32_t kStages = 4; + using Gemm = typename cutlass::gemm::device::EllGemm< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementOutput, + LayoutC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape, + WarpShape, + InstructionShape, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + kStages, kAlignmentA, kAlignmentB>; + + // + // Profile it + // + + Testbed testbed(options); + + if (!testbed.sufficient()) { + std::cout << "The active CUDA device lacks sufficient hardware resources to execute this kernel.\n"; + return 0; + } + + Result result = testbed.profile(); + if (!result.passed) { + std::cout << "Profiling CUTLASS ELL block sparse GEMM has failed.\n"; + std::cout << "\nFailed\n"; + return -1; + } + + std::cout << "\nPassed\n"; + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/44_multi_gemm_ir_and_codegen/README.md b/examples/44_multi_gemm_ir_and_codegen/README.md new file mode 100644 index 0000000000..fd1839c5ad --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/README.md @@ -0,0 +1,63 @@ +This example provides utilities for generating back-to-back (B2B) GEMMs using CUTLASS. + +## Quick start +A configuration file containing the GEMMs to be fused together is located in [config.json](config.json). Edit +this to change the configuration that you would like to run. +```shell +cd ir_gen + +# Set up basic variables +out_dir=directory_to_emit_files +cutlass_dir=$(pwd)/../../.. +config_file=$(pwd)/../config.json + +# Generate code for GEMMs described in `config_file` +./generate.sh $config_file $out_dir $cutlass_dir + +# Build the generated code +cd $out_dir +mkdir build && cd build +cmake .. -DGPU_ARCHS="75;80" +make -j + +# Run the generated code with M=1024 K0=32 and Batch=1 +./sample 1024 32 1 +``` + +## Current restrictions +This experimental example has the following restrictions: +1. N tile should not exceed 256, or register spilling will occur. +2. Only FP16 is supported currently +3. Matrix A must be row major, matrix B must be column major, matrices C and D must be row major. + +## Copyright + +Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+SPDX-License-Identifier: BSD-3-Clause + +``` + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` diff --git a/examples/44_multi_gemm_ir_and_codegen/config.json b/examples/44_multi_gemm_ir_and_codegen/config.json new file mode 100644 index 0000000000..bb8757c092 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/config.json @@ -0,0 +1,32 @@ +{ + "0": { + "A_tp": "fp16", "B_tp": "fp16", "C_tp": "fp16", "Acc_tp": "fp16", + "A_format": "Row", "B_format": "Col", "C_format": "Row", + "mnk": [15000, 256, 32], + "epilogue": { + "tp": "LeakyRelu", + "bias": {"addbias": false, "bias_tp": "mat"}, + "args": [["float", "leaky_alpha", 1.3]] + } + }, + "1": { + "A_tp": "fp16", "B_tp": "fp16", "C_tp": "fp16", "Acc_tp": "fp16", + "A_format": "Row", "B_format": "Col", "C_format": "Row", + "mnk": [15000, 128, 256], + "epilogue": { + "tp": "LeakyRelu", + "bias": {"addbias": false, "bias_tp": "mat"}, + "args": [["float", "leaky_alpha", 1.3]] + } + }, + "2": { + "A_tp": "fp16", "B_tp": "fp16", "C_tp": "fp16", "Acc_tp": "fp16", + "A_format": "Row", "B_format": "Col", "C_format": "Row", + "mnk": [15000, 64, 128], + "epilogue": { + "tp": "LeakyRelu", + "bias": {"addbias": false, "bias_tp": "mat"}, + "args": [["float", "leaky_alpha", 1.3]] + } + } +} diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_bias_act_epilogue_tensor_op.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_bias_act_epilogue_tensor_op.h new file mode 100644 index 0000000000..2535e28e22 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_bias_act_epilogue_tensor_op.h @@ -0,0 +1,154 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + + The epilogue rearranges the result of a matrix product through shared memory to match canonical + tensor layouts in global memory. Epilogues support conversion and reduction operations. + +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" + +#include "cutlass/gemm/gemm.h" + +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_clamp.h" +#include "cutlass/epilogue/thread/conversion_op.h" +#include "cutlass/epilogue/thread/reduction_op.h" + +#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h" + +#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h" +#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h" +#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" +#include "cutlass/epilogue/threadblock/shared_load_iterator.h" +#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h" + +// #include "cutlass/epilogue/threadblock/epilogue.h" +#include "cutlass/epilogue/threadblock/interleaved_epilogue.h" + +#include "fused_bias_act_epilogue.h" +#include "../warp/fused_bias_act_fragment_iterator_tensor_op.h" +#include "output_tile_thread_map_for_fused_bias.h" +#include "default_thread_map_tensor_op_for_fused_bias.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////////////////////////// + +/// Defines sensible defaults for epilogues for TensorOps. 
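+/// DefaultFusedBiasActEpilogueTensorOp (below) assembles the fused bias/activation epilogue from
+/// the pieces defined in this example: the DefaultThreadMapTensorOpForFusedBias thread map, the
+/// FusedBiasActFragmentIteratorTensorOp accumulator fragment iterator (or the complex-valued
+/// fragment iterator when the accumulator type is complex), and the FusedBiasActEpilogue itself.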
+template < + typename Shape_, + typename WarpMmaTensorOp_, + int PartitionsK, + typename OutputOp_, + int ElementsPerAccess +> +struct DefaultFusedBiasActEpilogueTensorOp { + + using Shape = Shape_; + using WarpMmaTensorOp = WarpMmaTensorOp_; + static int const kPartitionsK = PartitionsK; + using OutputOp = OutputOp_; + static int const kElementsPerAccess = ElementsPerAccess; + using ElementOutput = typename OutputOp::ElementOutput; + using LayoutC = typename WarpMmaTensorOp::LayoutC; + using ElementAccumulator = typename WarpMmaTensorOp::ElementC; + + // + // Thread map + // + + using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOpForFusedBias< + Shape, + typename WarpMmaTensorOp::Shape, + kPartitionsK, + ElementOutput, + kElementsPerAccess + >::Type; + + using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator< + OutputTileThreadMap, + ElementOutput + >; + + using AccumulatorFragmentIterator = typename std::conditional::value, + cutlass::epilogue::warp::FragmentIteratorComplexTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename WarpMmaTensorOp::Policy::Operator::ElementC, + typename WarpMmaTensorOp::Policy::Operator::FragmentC, + LayoutC>, + cutlass::epilogue::warp::FusedBiasActFragmentIteratorTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename WarpMmaTensorOp::Policy::Operator::ElementC, + typename WarpMmaTensorOp::Policy::Operator::FragmentC, + LayoutC> >::type; + + // + // Define the epilogue + // + using Epilogue = cutlass::epilogue::threadblock::FusedBiasActEpilogue< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputTileIterator, + AccumulatorFragmentIterator, + OutputOp + >; +}; + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_thread_map_tensor_op_for_fused_bias.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_thread_map_tensor_op_for_fused_bias.h new file mode 100644 index 0000000000..22f8e282a3 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_thread_map_tensor_op_for_fused_bias.h @@ -0,0 +1,113 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + +*/ + +#pragma once + +#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/pitch_linear.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Defines the optimal thread map for TensorOp accumulator layouts +template < + typename ThreadblockShape_, + typename WarpShape_, + int PartitionsK, + typename Element_, + int ElementsPerAccess +> +struct DefaultThreadMapTensorOpForFusedBias { + + using ThreadblockShape = ThreadblockShape_; + using WarpShape = WarpShape_; + static int const kPartitionsK = PartitionsK; + using Element = Element_; + static int const kElementsPerAccess = ElementsPerAccess; + + // + // Definitions + // + + struct Detail { + + /// Tensor Operations fundamentally perform operations on 8 rows + static int const kTensorOpRows = 8; + static int const kWarpSize = 32; + + static_assert( + !(ThreadblockShape::kM % WarpShape::kM) && + !(ThreadblockShape::kM % WarpShape::kM), "Divisibility"); + + /// Number of warps + using WarpCount = gemm::GemmShape< + ThreadblockShape::kM / WarpShape::kM, + ThreadblockShape::kN / WarpShape::kN, + kPartitionsK + >; + + /// Number of participating threads + static int const kThreads = WarpCount::kCount * kWarpSize; + }; + + // + // ThreadMap + // + + /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap + using Type = OutputTileOptimalThreadMapBiasAct < + OutputTileShape, + OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>, + Detail::kThreads, + kElementsPerAccess, + sizeof_bits::value + >; +}; + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/fused_bias_act_epilogue.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/fused_bias_act_epilogue.h new file mode 100644 index 0000000000..1acb4a2de6 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/fused_bias_act_epilogue.h @@ -0,0 +1,215 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + + The epilogue rearranges the result of a matrix product through shared memory to match canonical + tensor layouts in global memory. Epilogues support conversion and reduction operations. 
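+
+ Note: unlike the standard threadblock epilogue, the FusedBiasActEpilogue defined below does not
+ go through shared memory or write to global memory. It applies the output operator (bias and
+ activation) to each accumulator fragment, optionally reading a source tile, and stores the
+ result into a second accumulator tile (fused_bias_act_accumlators) so that it can feed the
+ next GEMM of the fused sequence.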
+ +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/layout/vector.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/tensor_coord.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/functional.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_iterator.h" +#include "cutlass/epilogue/threadblock/epilogue_base.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Epilogue operator without splitk +template < + typename Shape_, ///< Shape of threadblock tile (concept: GemmShape) + typename WarpMmaOperator_, ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp) + int PartitionsK, ///< Number of partitions of the K dimension + typename OutputTileIterator_, ///< Tile iterator reading and writing output tensors + typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting accumulators + typename OutputOp_ ///< Output operator +> +class FusedBiasActEpilogue { + +public: + + using Shape = Shape_; + using WarpMmaOperator = WarpMmaOperator_; + static int const kPartitionsK = PartitionsK; + using OutputTileIterator = OutputTileIterator_; + using AccumulatorFragmentIterator = AccumulatorFragmentIterator_; + using OutputOp = OutputOp_; + + /// Output layout is always row-major + using Layout = layout::RowMajor; + using LongIndex = typename Layout::LongIndex; + + /// The complete warp-level accumulator tile + using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile; + + /// Output element + using ElementOutput = typename OutputTileIterator::Element; + + /// Output access size + static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess; + + +public: + + + static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero."); + + static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess), + "Divisibility"); + +public: + + /// Constructor + CUTLASS_DEVICE + FusedBiasActEpilogue( + ){ } + + /// Streams the result to global memory + CUTLASS_DEVICE + void operator()( + OutputOp const &output_op, ///< Output operator + AccumulatorTile &accumulators, ///< Complete warp-level accumulator tile + AccumulatorTile & fused_bias_act_accumlators, + OutputTileIterator source_iterator) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + + bool need_bias = output_op.is_source_needed(); + + if (need_bias) + compute_source_needed_(output_op, accumulators, fused_bias_act_accumlators, source_iterator); + else + compute_source_no_needed_(output_op, accumulators, fused_bias_act_accumlators); + + + } + + CUTLASS_DEVICE + void operator()( + OutputOp const &output_op, ///< Output operator + AccumulatorTile &accumulators, ///< Complete warp-level accumulator tile + AccumulatorTile & fused_bias_act_accumlators) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + + compute_source_no_needed_(output_op, accumulators, fused_bias_act_accumlators); + } + + CUTLASS_DEVICE + void compute_source_needed_( + OutputOp const &output_op, ///< Output operator + AccumulatorTile &accumulators, ///< 
Complete warp-level accumulator tile + AccumulatorTile & fused_bias_act_accumlators, + OutputTileIterator source_iterator) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + + typename OutputTileIterator::Fragment source_fragment; + + + source_fragment.clear(); + + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); + AccumulatorFragmentIterator fused_bias_act_fragment_iterator(fused_bias_act_accumlators); + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { + + source_iterator.load(source_fragment); + ++source_iterator; + + typename AccumulatorFragmentIterator::Fragment accum_fragment; + + accum_fragment_iterator.load(accum_fragment); + ++accum_fragment_iterator; + + typename AccumulatorFragmentIterator::Fragment fused_bias_act_fragment; + fused_bias_act_fragment = output_op(accum_fragment, source_fragment); + + fused_bias_act_fragment_iterator.store(fused_bias_act_fragment); + ++fused_bias_act_fragment_iterator; + } + } + + CUTLASS_DEVICE + void compute_source_no_needed_( + OutputOp const &output_op, ///< Output operator + AccumulatorTile &accumulators, ///< Complete warp-level accumulator tile + AccumulatorTile & fused_bias_act_accumlators) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + + + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); + AccumulatorFragmentIterator fused_bias_act_fragment_iterator(fused_bias_act_accumlators); + + + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < AccumulatorFragmentIterator::kIterations; ++iter) { + + typename AccumulatorFragmentIterator::Fragment accum_fragment; + + accum_fragment_iterator.load(accum_fragment); + ++accum_fragment_iterator; + + typename AccumulatorFragmentIterator::Fragment fused_bias_act_fragment; + fused_bias_act_fragment = output_op(accum_fragment); + + fused_bias_act_fragment_iterator.store(fused_bias_act_fragment); + ++fused_bias_act_fragment_iterator; + } + } + +}; + + + + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/output_tile_thread_map_for_fused_bias.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/output_tile_thread_map_for_fused_bias.h new file mode 100644 index 0000000000..c39e8ce199 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/output_tile_thread_map_for_fused_bias.h @@ -0,0 +1,311 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Metaprogram for determining the mapping of output elements to threads for epilogue tiles. + + +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/fast_math.h" + +#include "cutlass/epilogue/threadblock/output_tile_thread_map.h" +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// RowArrangement determines how one or more warps cover a region of consecutive rows. +template < + typename Shape, + int WarpsRemaining, + int ElementsPerAccess, + int ElementSize, + bool Is2dTile +> +struct RowArrangementBiasAct; + +/// RowArrangement in which each warp's access is a 1D tiled arrangement. +template < + typename Shape, + int WarpsRemaining, + int ElementsPerAccess, + int ElementSize +> +struct RowArrangementBiasAct { + static int const kWarpSize = 32; + static int const kElementsPerAccess = ElementsPerAccess; + static int const kElementSize = ElementSize; + + static int const kIterationsRow = 1; + static int const kDeltaRow = 1; + static int const kIterationsColumn = Shape::kColumn / kElementsPerAccess / kWarpSize; + static int const kDeltaColumn = kWarpSize * kElementsPerAccess; + + static int const kAccessWidth = kWarpSize; + static int const kAccessRows = 1; + static int const kWarpPartitionsRow = 1; + static int const kWarpPartitionsColumn = WarpsRemaining; +}; + +/// RowArrangement in which each warp's access is a 2D tiled arrangement. 
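// Worked example for the 1D arrangement above (numbers are illustrative): with
// Shape::kColumn = 256, ElementsPerAccess = 8 and a 32-lane warp, each warp covers a
// single row with kIterationsColumn = 256 / 8 / 32 = 1 column iteration and
// kDeltaColumn = 32 * 8 = 256, i.e. one fully coalesced sweep across the row. The 2D
// specialization that follows instead splits the warp across several rows whenever a
// row is too narrow to occupy all 32 lanes at once.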
+template < + typename Shape, + int WarpsRemaining, + int ElementsPerAccess, + int ElementSize +> +struct RowArrangementBiasAct { + + static int const kMemoryAccessSize = 4;//128; + static int const kWarpSize = 32; + + static int const kElementsPerAccess = ElementsPerAccess; + static int const kElementSize = ElementSize; + + struct Detail { + static int const kShapeRow = Shape::kRow / WarpsRemaining; + static int const kShapeWidth = Shape::kColumn / kElementsPerAccess; + + static int const kTargetMemoryAccessWidth = + kMemoryAccessSize / (kElementsPerAccess * kElementSize / 8); + + static int const kTargetAccessRows = kWarpSize / kTargetMemoryAccessWidth; + }; + + static int const kAccessWidth = + (Detail::kTargetAccessRows > Detail::kShapeRow ? + kWarpSize / Detail::kShapeRow + : const_min( + Detail::kShapeWidth, + const_min(kWarpSize, kMemoryAccessSize / (kElementsPerAccess * kElementSize / 8)) + )); + + static int const kAccessRows = + (Detail::kTargetAccessRows > Detail::kShapeRow ? + Detail::kShapeRow + : const_min(Shape::kRow, kWarpSize / kAccessWidth)); + + static int const kIterationsRow = Detail::kShapeRow / kAccessRows; + static int const kDeltaRow = kAccessRows; + + static int const kIterationsColumn = Detail::kShapeWidth / kAccessWidth; + static int const kDeltaColumn = kAccessWidth * kElementsPerAccess; + + static_assert( kAccessWidth * kElementsPerAccess <= Shape::kColumn, "Accessing too many elements per access"); + static_assert( kIterationsColumn > 0, "Iteration Count Column must be > 0" ); + static_assert( kIterationsRow > 0, "Iteration Count Row must be > 0" ); + + static int const kWarpPartitionsRow = 1; + static int const kWarpPartitionsColumn = 1; +}; + +} + +//////////////////////////////////////////////////////////////////////////////// + +/// Template metaprogram for partitioning a 4D space across warps to achieve several performance +/// objectives: +/// +/// - coalesced memory accesses in units of 16 Byte lines +/// - minimal address arithmetic +/// - minimal predicate calculations +/// +template < + typename Shape_, + typename Count_, + int Threads, + int ElementsPerAccess, + int ElementSize +> +struct OutputTileOptimalThreadMapBiasAct { + + using Shape = Shape_; + using Count = Count_; + + static int const kWarpSize = 32; + static int const kThreads = Threads; + static int const kWarpCount = kThreads / kWarpSize; + + static int const kElementsPerAccess = ElementsPerAccess; + static int const kElementSize = ElementSize; + + // + // Metaprogram computation + // + + struct Detail { + + // Clusters + static int const kIterationsCluster = + ((Shape::kCluster > kWarpCount) ? + Shape::kCluster / kWarpCount + : 1); + + static int const kDeltaCluster = + ((Shape::kCluster > kWarpCount) ? + Shape::kRow * Count::kRow * Shape::kGroup * Count::kGroup * Shape::kCluster / kIterationsCluster + : 1); + + static int const kCompactedDeltaCluster = + ((Shape::kCluster > kWarpCount) ? + Shape::kRow * Shape::kGroup * Shape::kCluster / kIterationsCluster + : 1); + + static int const kWarpPartitionsCluster = + ((Shape::kCluster > kWarpCount) ? + kWarpCount + : kWarpCount / Shape::kCluster); + + static int const kWarpsRemainingForGroups = + ((Shape::kCluster > kWarpCount) ? 1 : kWarpCount / Shape::kCluster); + + // Groups + static int const kIterationsGroup = + ((Shape::kGroup > kWarpsRemainingForGroups) ? + Shape::kGroup / kWarpsRemainingForGroups + : 1); + + static int const kDeltaGroup = + ((Shape::kGroup > kWarpsRemainingForGroups) ? 
+ Shape::kRow * Count::kRow * Shape::kGroup / kIterationsGroup + : 1); + + static int const kCompactedDeltaGroup = + ((Shape::kGroup > kWarpsRemainingForGroups) ? + Shape::kRow * Shape::kGroup / kIterationsGroup + : 1); + + static int const kWarpPartitionsGroup = + ((Shape::kGroup > kWarpsRemainingForGroups) ? + 1 + : kWarpsRemainingForGroups / Shape::kGroup); + + static int const kWarpsRemainingForRows = + ((Shape::kGroup > kWarpsRemainingForGroups) ? + 1 + : kWarpsRemainingForGroups / Shape::kGroup); + + // Rows + using RowArrangement = detail::RowArrangementBiasAct< + Shape, + kWarpsRemainingForRows, + kElementsPerAccess, + kElementSize, + (Shape::kRow > kWarpsRemainingForRows) + >; + + // Warp partitions + using WarpPartitions = OutputTileShape< + RowArrangement::kWarpPartitionsColumn, + RowArrangement::kWarpPartitionsRow, + kWarpPartitionsGroup, + kWarpPartitionsCluster, + 1>; + + static int const kAccessWidth = RowArrangement::kAccessWidth; + static int const kAccessRows = RowArrangement::kAccessRows; + }; + + // + // Output + // + + using Iterations = OutputTileShape< + Detail::RowArrangement::kIterationsColumn, + Detail::RowArrangement::kIterationsRow, + Detail::kIterationsGroup, + Detail::kIterationsCluster, + 1>; + + using Delta = OutputTileShape< + Detail::RowArrangement::kDeltaColumn, + Detail::RowArrangement::kDeltaRow, + Detail::kDeltaGroup, + Detail::kDeltaCluster, + 1>; + + /// Initial offset function + CUTLASS_HOST_DEVICE + static MatrixCoord initial_offset(int thread_idx) { + + int warp_idx = thread_idx / kWarpSize; + int lane_idx = thread_idx % kWarpSize; + + // Compute warp location + int cluster_idx = warp_idx / Detail::WarpPartitions::kCluster; + int residual_cluster = warp_idx % Detail::WarpPartitions::kCluster; + + int group_idx = residual_cluster / Detail::WarpPartitions::kGroup; + int residual_group = residual_cluster % Detail::WarpPartitions::kGroup; + + int row_idx = residual_group / Detail::WarpPartitions::kRow; + int col_idx = residual_group % Detail::WarpPartitions::kRow; + + // Compute per-lane offset + int lane_row_offset = lane_idx / Detail::kAccessWidth; + int lane_col_offset = lane_idx % Detail::kAccessWidth; + + // Compute coordinate in output space + int cluster_offset = cluster_idx * Shape::kRow * Count::kRow * Shape::kGroup * Count::kGroup; + int group_offset = group_idx * Shape::kRow * Count::kRow; + int row_offset = row_idx * Iterations::kRow * Detail::kAccessRows; + int column_offset = col_idx * Iterations::kColumn * Detail::kAccessWidth * kElementsPerAccess; + + return MatrixCoord( + cluster_offset + group_offset + row_offset + lane_row_offset, + (column_offset + lane_col_offset) * kElementsPerAccess + ); + } + +}; + + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/warp/fused_bias_act_fragment_iterator_tensor_op.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/warp/fused_bias_act_fragment_iterator_tensor_op.h new file mode 100644 index 0000000000..cf12fef3b5 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/warp/fused_bias_act_fragment_iterator_tensor_op.h @@ -0,0 +1,189 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile + that participate in one warp-level store operation. + + Typically, the accumulator tile is the largest single block of register-backed storage + within the kernel. Storing it to memory is best accomplished by partitioning it into + smaller tiles and storing these sequentially. + + Round trips through shared memory during the Epilogue phase require partitioning, as + shared memory capacity is typically insufficient for a threadblock's total accumulator + size. 
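    A minimal sketch of the intended access pattern (iterator and type names are
    illustrative): construct the fragment iterator over a warp's accumulator tile and
    advance it once per store iteration,

        FusedBiasActFragmentIteratorTensorOp<...> frag_it(accumulators);
        for (int i = 0; i < decltype(frag_it)::kIterations; ++i, ++frag_it) {
          typename decltype(frag_it)::Fragment frag;
          frag_it.load(frag);
          // apply the output operator to frag, then frag_it.store(frag) writes it back
        }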
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/layout/matrix.h" + +#include "cutlass/epilogue/warp/tensor_op_policy.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace warp { + +//////////////////////////////////////////////////////////////////////////////// + +/// +template < + typename WarpShape, ///< shape of warp-level GEMM (concept: MatrixShape) + typename OperatorShape, ///< matrix multiply operation shape (concept: gemm::GemmShape) + typename OperatorElementC, ///< matrix multiply operation data type (concept: data type) + typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array) + typename Layout ///< target shared memory layout +> +class FusedBiasActFragmentIteratorTensorOp; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for row-major shared memory +template < + typename WarpShape_, ///< shape of the warp-level GEMM tile + typename OperatorShape_, ///< matrix multiply operation shape (concept: gemm::GemmShape) + typename OperatorElementC_, ///< matrix multiply operation data type (concept: data type) + typename OperatorFragmentC_ ///< matrix multiply operation fragment (concept: Array) +> +class FusedBiasActFragmentIteratorTensorOp { +public: + + using WarpShape = WarpShape_; + using OperatorShape = OperatorShape_; + using OperatorElementC = OperatorElementC_; + using OperatorFragmentC = OperatorFragmentC_; + using Layout = layout::RowMajor; + + using Policy = TensorOpPolicy; + + /// This is the fragment size produced by one access of the iterator. + using Fragment = Array< + OperatorElementC, + Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>; + + /// This is the complete warp-level accumulator tile. 
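// Example sizing (illustrative): with a 64x64 warp tile and a 16x8x8 tensor op
// instruction, Policy::OperatorCount::kColumn would be 64 / 8 = 8, so one Fragment
// spans the accumulator elements of all MMA columns visited by a single iteration of
// this iterator. The complete warp-level tile declared below aggregates every such
// fragment across all iterations.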
+ using AccumulatorTile = Array< + OperatorElementC, + OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn>; + + using OutputAccumulatorTile = AccumulatorTile; + + /// Number of times this iterator can be incremented + static int const kIterations = Policy::kIterations; + +private: + + /// Internal access type + using AccessType = Array; + +private: + + // + // Data members + // + + /// Accumulator tile + AccessType *accumulators_; + + /// Internal index + int index_; + +public: + + /// Constructs an iterator + CUTLASS_HOST_DEVICE + FusedBiasActFragmentIteratorTensorOp(AccumulatorTile &accum): + accumulators_(reinterpret_cast(&accum)), + index_(0) { + } + + /// Increments + CUTLASS_HOST_DEVICE + FusedBiasActFragmentIteratorTensorOp &operator++() { + ++index_; + return *this; + } + + /// Decrements + CUTLASS_HOST_DEVICE + FusedBiasActFragmentIteratorTensorOp &operator--() { + --index_; + return *this; + } + + /// Loads a fragment from the referenced part of the accumulator tile + CUTLASS_HOST_DEVICE + void load(Fragment &frag, int index_offset = 0) const { + + int index = index_ + index_offset; + + AccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int accumulator_access_offset = + index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess; + + frag_ptr[n] = accumulators_[accumulator_access_offset]; + } + } + /// Stores a fragment from the referenced part of the accumulator tile + CUTLASS_HOST_DEVICE + void store(Fragment &frag, int index_offset = 0) const { + + int index = index_ + index_offset; + + AccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int accumulator_access_offset = + index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess; + + accumulators_[accumulator_access_offset] = frag_ptr[n]; + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/gemm/warp/mma_tensor_op_fragment_iterator_without_output_op.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/gemm/warp/mma_tensor_op_fragment_iterator_without_output_op.h new file mode 100644 index 0000000000..0e89d6f875 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/gemm/warp/mma_tensor_op_fragment_iterator_without_output_op.h @@ -0,0 +1,427 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/numeric_conversion.h" + +namespace cutlass { +namespace gemm { +namespace warp { + + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Size of the accumulation tile shape (concept: MatrixShape) + typename AccumulatorShape_, + /// KBlocks columns to compute residual + int KBlocksColumn_, + /// Accumulator Element type + typename ElementAccumulator_, + /// Element type + typename Element_, + /// Layout of operand in memory + typename Layout_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Whether beta is zero + bool IsBetaZero_ > +class MmaTensorOpPureFragmentIterator; + + +// Partial specialization for col-major accumulator tile +// And Element type is the same as Accumulator Element type + +template < + /// Shape of warp tile to load (concept: MatrixShape) + typename Shape_, + /// Shape of the warp accumulation tile (concept: MatrixShape) + typename AccumulatorShape_, + /// KBlocks columns to compute residual + int KBlocksColumn_, + /// Element type + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_> +class MmaTensorOpPureFragmentIterator { + public: + + /// Shape of warp tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Shape of the warp accumulation tile (concept: MatrixShape) + using AccumulatorShape = AccumulatorShape_; + + /// KBlocks columns to compute residual + static int const kKBlockColumn = KBlocksColumn_; + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::ColumnMajor; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Whether beta is zero + static bool const IsBetaZero = true; + + /// Number of participating threads + static int const kThreads = 32; + + /// Internal structure of iterator - made public to enable introspection + struct Policy { + static_assert( + !(Shape::kRow % InstructionShape::kM) && + !(Shape::kColumn % InstructionShape::kN), + "Shape of warp-level Mma must be divisible by operator 
shape."); + static_assert( + !(AccumulatorShape::kRow % Shape::kRow) && + !(AccumulatorShape::kColumn % Shape::kColumn), + "Shape of Warp Accumulator must be divisible by warp shape."); + static_assert( + !(kKBlockColumn % Shape::kColumn), + "KBlock size must be divisible by warp shape."); + + /// Number of times this iterator can be incremented + static int const kIterations = AccumulatorShape::kCount / Shape::kCount; + }; + +private: + + static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads; + + /// Number of mma operations performed by a warp + using MmaIterations = MatrixShape; + /// Number of mma operations performed by the entire accumulator + using AccumulatorIterations = MatrixShape; + + /// Number of K iterations + static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn; + static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn; + static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn + * (AccumulatorShape::kRow / Shape::kRow); + static int const kResidualIndex = kResidualColumn / Shape::kColumn + * (AccumulatorShape::kRow / Shape::kRow); + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + /// This is the fragment size produced by one access of the iterator. + using Fragment = Array; + + /// Accumulator Fragment object + using AccumulatorFragment = Array; + + +private: + + /// Internal access type + using AccessType = Array; + +private: + // + // Data members + // + + /// Accumulator tile + AccessType const *accumulators_; + + /// Internal index + int index_; + + /// Used to access residual tile first + bool is_residual_tile_; + +public: + /// Constructs an iterator + CUTLASS_HOST_DEVICE + MmaTensorOpPureFragmentIterator(AccumulatorFragment const &accum) + : accumulators_(reinterpret_cast(&accum)), + index_(0), is_residual_tile_(true) {} + + /// Add offset + CUTLASS_HOST_DEVICE + void add_offset(int index_offset) { + index_ += index_offset; + if(is_residual_tile_ && index_ >= kKBlockColumnIterations) { + index_ = index_ - kKBlockColumnIterations + kResidualIndex; + is_residual_tile_ = false; + } + } + + /// Increments + CUTLASS_HOST_DEVICE + MmaTensorOpPureFragmentIterator &operator++() { + add_offset(1); + return *this; + } + + /// Decrements + CUTLASS_HOST_DEVICE + MmaTensorOpPureFragmentIterator &operator--() { + add_offset(-1); + return *this; + } + + /// Loads a fragment from the referenced part of the accumulator tile + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + AccessType src_fragment; + src_fragment.clear(); + + + AccessType *frag_ptr = reinterpret_cast(&frag); + + int index_m = (index_ * MmaIterations::kRow) % AccumulatorIterations::kRow; + int index_n = (index_ * MmaIterations::kRow) / AccumulatorIterations::kRow + * MmaIterations::kColumn; + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; n++) { + for (int m = 0; m < MmaIterations::kRow; m++) { + int accumulator_access_offset = + (n + index_n) * AccumulatorIterations::kRow + m + index_m; + + frag_ptr[n * MmaIterations::kRow + m].clear(); + if(!(is_residual_tile_ && index_ >= kResidualIndex)) + frag_ptr[n * MmaIterations::kRow + m] = accumulators_[accumulator_access_offset]; + // frag_ptr[n * MmaIterations::kRow + m] = output_op(accumulators_[accumulator_access_offset], src_fragment); + } + } + } + +}; + +// Partial specialization for row-major accumulator tile + +template < + /// 
Shape of warp tile to load (concept: MatrixShape) + typename Shape_, + /// Shape of the warp accumulation tile (concept: MatrixShape) + typename AccumulatorShape_, + /// KBlocks columns to compute residual + int KBlocksColumn_, + /// Accumulator Element type + typename ElementAccumulator_, + /// Element type + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_> +class MmaTensorOpPureFragmentIterator { + public: + + /// Shape of warp tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Shape of the warp accumulation tile (concept: MatrixShape) + using AccumulatorShape = AccumulatorShape_; + + /// KBlocks columns to compute residual + static int const kKBlockColumn = KBlocksColumn_; + + /// Accumulator Element type + using ElementAccumulator = ElementAccumulator_; + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajor; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Whether beta is zero + static bool const IsBetaZero = true; + + /// Number of participating threads + static int const kThreads = 32; + + /// Internal structure of iterator - made public to enable introspection + struct Policy { + static_assert( + !(Shape::kRow % InstructionShape::kM) && + !(Shape::kColumn % InstructionShape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + static_assert( + !(AccumulatorShape::kRow % Shape::kRow) && + !(AccumulatorShape::kColumn % Shape::kColumn), + "Shape of Warp Accumulator must be divisible by warp shape."); + static_assert( + !(kKBlockColumn % Shape::kColumn), + "KBlock size must be divisible by warp shape."); + + /// Number of times this iterator can be incremented + static int const kIterations = AccumulatorShape::kCount / Shape::kCount; + }; + +private: + + static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads; + + /// Number of mma operations performed by a warp + using MmaIterations = MatrixShape; + /// Number of mma operations performed by the entire accumulator + using AccumulatorIterations = MatrixShape; + + /// Number of K iterations + static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn; + static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn; + static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn + * (AccumulatorShape::kRow / Shape::kRow); + static int const kResidualIndex = kResidualColumn / Shape::kColumn + * (AccumulatorShape::kRow / Shape::kRow); + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + /// This is the fragment size produced by one access of the iterator. 
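// Worked example of the residual-K bookkeeping above (shapes are illustrative): with
// AccumulatorShape = 64x96, Shape = 64x32 and kKBlockColumn = 64, the constants become
// kKBlockIterations = (96 + 63) / 64 = 2, kResidualColumn = 96 - 64 = 32,
// kKBlockColumnIterations = (64 / 32) * (64 / 64) = 2 and kResidualIndex = 1, so the
// iterator visits the 32-column residual K block first and then falls back to full
// 64-column K blocks.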
+ using Fragment = Array; + + /// Accumulator Fragment object + using AccumulatorFragment = Array; + + +private: + + /// Internal access type + using AccessType = Array; + using FragmentAccessType = Array; + +private: + // + // Data members + // + + /// Accumulator tile + AccessType const *accumulators_; + + /// Internal index + int index_; + + /// Used to access residual tile first + bool is_residual_tile_; + +public: + /// Constructs an iterator + CUTLASS_HOST_DEVICE + MmaTensorOpPureFragmentIterator(AccumulatorFragment const &accum) + : accumulators_(reinterpret_cast(&accum)), + index_(0), is_residual_tile_(true) {} + + /// Add offset + CUTLASS_HOST_DEVICE + void add_offset(int index_offset) { + index_ += index_offset; + if(is_residual_tile_ && index_ >= kKBlockColumnIterations) { + index_ = index_ - kKBlockColumnIterations + kResidualIndex; + is_residual_tile_ = false; + } + } + + /// Increments + CUTLASS_HOST_DEVICE + MmaTensorOpPureFragmentIterator &operator++() { + add_offset(1); + return *this; + } + + /// Decrements + CUTLASS_HOST_DEVICE + MmaTensorOpPureFragmentIterator &operator--() { + add_offset(-1); + return *this; + } + + /// Loads a fragment from the referenced part of the accumulator tile + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + + FragmentAccessType src_fragment; + src_fragment.clear(); + + FragmentAccessType *frag_ptr = reinterpret_cast(&frag); + + int index_m = (index_ * MmaIterations::kRow) % AccumulatorIterations::kRow; + int index_n = (index_ * MmaIterations::kRow) / AccumulatorIterations::kRow + * MmaIterations::kColumn; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; m++) { + for (int n = 0; n < MmaIterations::kColumn; n++) { + int accumulator_access_offset = + (m + index_m) * AccumulatorIterations::kColumn + n + index_n; + + frag_ptr[m * MmaIterations::kColumn + n].clear(); + if(!(is_residual_tile_ && index_ >= kResidualIndex)) + frag_ptr[m * MmaIterations::kColumn + n] = (accumulators_[accumulator_access_offset]); + } + } + } + +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_all_code.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_all_code.py new file mode 100644 index 0000000000..6aef3bca9c --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_all_code.py @@ -0,0 +1,129 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################################# + +import gen_turing_and_volta as api_generator +import gen_sample as sample_creater +import gen_cmake as cmake_creater +import gen_verify as verify_creater +import gen_device as b2b_fused_generator +import replace_fix_impl_header + +import argparse +import os +import json + + +parser = argparse.ArgumentParser(description="Generates Fused Multi-GEMM CUTLASS Kernels") +parser.add_argument("--config-file", default="config.json", help="JSON file containing configuration to generate") +parser.add_argument("--gen-name", default="FusedMultiGemmForward", help="Specific the output name") +parser.add_argument("--output-dir", default="", help="Specifies the output dir") +parser.add_argument("--cutlass-dir", default="", help="Specifies the dependent CUTLASS repo dir") +parser.add_argument("--gen-include-cutlass-dir", default="", help="Specifies the generated CUTLASS code include dir, if needed.") +args = parser.parse_args() + +gen_name = args.gen_name + +cutlass_deps_dir = args.cutlass_dir + +output_dir = args.output_dir +output_dir += "/" + +cutlass_deps_root = args.gen_include_cutlass_dir +if cutlass_deps_root == '': + cutlass_deps_root = cutlass_deps_dir + "/include/" +cutlass_deps_root +='/' + + +if not os.path.exists(output_dir): + os.makedirs(output_dir) + +if not os.path.exists(output_dir + "/" + "auto_gen"): + os.mkdir(output_dir + "/" + "auto_gen") + +if not os.path.exists(output_dir + "/" + "fixed_impl"): + os.mkdir(output_dir + "/" + "fixed_impl" ) + +if not os.path.exists(output_dir + "/" + "sample"): + os.mkdir(output_dir + "/" + "sample" ) + +if not os.path.exists(output_dir + "/" + "auto_gen" + "/" + "device"): + os.mkdir(output_dir + "/" + "auto_gen" + "/" + "device") +if not os.path.exists(output_dir + "/" + "auto_gen" + "/" + "kernel"): + os.mkdir(output_dir + "/" + "auto_gen" + "/" + "kernel") +if not os.path.exists(output_dir + "/" + "auto_gen" + "/" + "threadblock"): + os.mkdir(output_dir + "/" + "auto_gen" + "/" + "threadblock") + +with open(args.config_file, 'r') as infile: + gemm_info_dict = json.load(infile) + +keys = sorted(gemm_info_dict.keys()) +fuse_gemm_info = [gemm_info_dict[k] for k in keys] + + +for_cutlass_gen_user_include_header_file = [ + cutlass_deps_root + "cutlass/epilogue/thread/linear_combination_leaky_relu.h", + cutlass_deps_root + "cutlass/epilogue/thread/linear_combination.h", +] + +for_fused_wrapper = [ + cutlass_deps_root + "cutlass/epilogue/thread/linear_combination_leaky_relu.h", + cutlass_deps_root + "cutlass/epilogue/thread/linear_combination.h", + "auto_gen/device/" + gen_name + ".h", + cutlass_deps_root + "cutlass/gemm/device/gemm_batched.h", + cutlass_deps_root + 
"cutlass/cutlass.h", +] + +# Copy fixed implementation to the output directory +fix_impl = replace_fix_impl_header.replace_fix_impl("../fixed_impl/", output_dir +"/fixed_impl/", cutlass_deps_root) +fix_impl.gen_code() + +auto_gen_output_dir = output_dir + "/auto_gen/" +project_root = "" +turing_plus = b2b_fused_generator.gen_device(fuse_gemm_info, gen_name, for_cutlass_gen_user_include_header_file, cutlass_deps_root, project_root, auto_gen_output_dir) +turing_plus.gen_code(75, 'hmma1688', False) + +api = api_generator.gen_one_API(fuse_gemm_info, gen_name, for_fused_wrapper, output_dir) +api.gen_code() + +# Generate C++ sample +os.system("cp ../leaky_bias.h " + output_dir + "/sample/") +os.system("cp ../utils.h " + output_dir + "/sample/") + +sample_dir = output_dir + "/sample/" +sample = sample_creater.gen_test(fuse_gemm_info, gen_name, for_cutlass_gen_user_include_header_file, sample_dir) +sample.gen_cpp_sample() + +cmake_gen = cmake_creater.gen_build_sys(cutlass_deps_dir, output_dir) +cmake_gen.gen_code() + +verify = verify_creater.gen_verify(fuse_gemm_info, gen_name, for_fused_wrapper, output_dir) +verify.gen_code() diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_cmake.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_cmake.py new file mode 100644 index 0000000000..5db6dd6e07 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_cmake.py @@ -0,0 +1,131 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################################# + +class gen_build_sys: + def __init__(self, cutlass_deps_dir, output_dir = "../"): + self.output_dir = output_dir + self.cutlass_deps_dir = cutlass_deps_dir + + def gen_top(self): + code = "" + code += '''\ +# Auto Generated code - Do not edit. 
+ +cmake_minimum_required(VERSION 3.8) +project(CUTLASS_MULTI_GEMMS LANGUAGES CXX CUDA) +find_package(CUDAToolkit) +set(CUDA_PATH ${{CUDA_TOOLKIT_ROOT_DIR}}) +set(CUTLASS_PATH \"{cutlass_deps_dir}/include\") +set(CUTLASS_UTIL_PATH \"{cutlass_deps_dir}/tools/util/include\") +list(APPEND CMAKE_MODULE_PATH ${{CUDAToolkit_LIBRARY_DIR}}) +'''.format(cutlass_deps_dir=self.cutlass_deps_dir) + + code += '''\ +set(GPU_ARCHS \"\" CACHE STRING + \"List of GPU architectures (semicolon-separated) to be compiled for.\") + +if(\"${GPU_ARCHS}\" STREQUAL \"\") + set(GPU_ARCHS \"70\") +endif() + +foreach(arch ${GPU_ARCHS}) + set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} -gencode arch=compute_${arch},code=sm_${arch}\") + if(SM STREQUAL 70 OR SM STREQUAL 75) + set(CMAKE_C_FLAGS \"${CMAKE_C_FLAGS} -DWMMA\") + set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -DWMMA\") + set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} -DWMMA\") + endif() +endforeach() + +set(CMAKE_C_FLAGS \"${CMAKE_C_FLAGS}\") +set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\") +set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} -Xcompiler -Wall\") + +set(CMAKE_C_FLAGS_DEBUG \"${CMAKE_C_FLAGS_DEBUG} -Wall -O0\") +set(CMAKE_CXX_FLAGS_DEBUG \"${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0\") +set(CMAKE_CUDA_FLAGS_DEBUG \"${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall\") + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if(CMAKE_CXX_STANDARD STREQUAL \"11\") + set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} --expt-extended-lambda\") + set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr\") +endif() + +set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -g -O3\") +set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} -Xcompiler -O3\") +set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} -Xcompiler=-fno-strict-aliasing\") + +set(COMMON_HEADER_DIRS + ${PROJECT_SOURCE_DIR} + ${CUDAToolkit_INCLUDE_DIRS} +) + +set(COMMON_LIB_DIRS + ${CUDAToolkit_LIBRARY_DIR} +) +list(APPEND COMMON_HEADER_DIRS ${CUTLASS_PATH}) +list(APPEND COMMON_HEADER_DIRS ${CUTLASS_UTIL_PATH}) +''' + code += '''\ +include_directories( + ${COMMON_HEADER_DIRS} +) + +link_directories( + ${COMMON_LIB_DIRS} +) + +add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) +add_definitions(-DGOOGLE_CUDA=1) + +add_executable(sample + sample/sample.cu + one_api.cu +) +target_link_libraries(sample PRIVATE + -lcudart + -lnvToolsExt + ${CMAKE_THREAD_LIBS_INIT} +) + +if(NOT DEFINED LIB_INSTALL_PATH) + set(LIB_INSTALL_PATH ${CMAKE_CURRENT_BINARY_DIR}) +endif() +''' + return code + + def gen_code(self): + top_code = self.gen_top() + with open(self.output_dir + "CMakeLists.txt", "w") as f: + f.write(top_code) diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_customized_epilogue.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_customized_epilogue.py new file mode 100644 index 0000000000..84621f2e79 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_customized_epilogue.py @@ -0,0 +1,120 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################################# + +import ast + +fuse_gemm_info = [ + { + 'epilogue': { + 'tp': 'LeakyRelu', #'CustomizedLeaky_RELU' + 'bias': {'addbias': False, 'bias_tp': 'mat'}, + 'args': [('float', 'leaky_alpha', 1.3), ], + 'func': ''' +y = max(leaky_alpha * x, x) +y = y * x + ''' + } + }, + +] +class AnalysisNodeVisitor(ast.NodeVisitor): + def visit_Import(self,node): + ast.NodeVisitor.generic_visit(self, node) + + def visit_ImportFrom(self,node): + ast.NodeVisitor.generic_visit(self, node) + + def visit_Assign(self,node): + print('Node type: Assign and fields: ', node._fields) + # print('Node type: Assign and targets value: ', node.targets, node.value) + + ast.NodeVisitor.generic_visit(self, node) + + def visit_BinOp(self, node): + print('Node type: BinOp and fields: ', node._fields) + print('node op: ', type(node.op).__name__) + ast.NodeVisitor.generic_visit(self, node) + + def visit_Expr(self, node): + print('Node type: Expr and fields: ', node._fields) + ast.NodeVisitor.generic_visit(self, node) + + def visit_Num(self,node): + print('Node type: Num and fields: ', node._fields) + print('Node type: Num: ', node.n) + + def visit_Name(self,node): + print('Node type: Name and fields: ', node._fields) + print('Node type: Name and fields: ', type(node.ctx).__name__, node.id) + + ast.NodeVisitor.generic_visit(self, node) + + def visit_Str(self, node): + print('Node type: Str and fields: ', node._fields) + +class CodeVisitor(ast.NodeVisitor): + def visit_BinOp(self, node): + if isinstance(node.op, ast.Add): + node.op = ast.Sub() + self.generic_visit(node) + + def visit_Assign(self, node): + print('Assign %s' % node.value) + self.generic_visit(node) + + def visit_Name(self, node): + print("Name:", node.id) + self.generic_visit(node) + + + def visit_FunctionDef(self, node): + print('Function Name:%s'% node.name.op) + self.generic_visit(node) + func_log_stmt = ast.Print( + dest = None, + values = [ast.Str(s = 'calling func: %s' % node.name, lineno = 0, col_offset = 0)], + nl = True, + lineno = 0, + col_offset = 0, + ) + node.body.insert(0, func_log_stmt) + +visitor = AnalysisNodeVisitor() + +code = \ +''' + +a=max(leaky_alpha * x, x +1) + +''' + +visitor.visit(ast.parse(code)) diff --git 
a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py new file mode 100644 index 0000000000..371a4be847 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py @@ -0,0 +1,469 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################################# + +from typing import * + +import helper +import gen_ir + +import gen_kernel as gen_ker + + +class gen_device: + def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, cutlass_deps_root, project_root, output_dir = "../"): + self.fuse_gemm_info = fuse_gemm_info + self.raw_gemm_info = fuse_gemm_info + self.b2b_num = len(fuse_gemm_info) + self.user_header_file = user_header_file + self.args = {} + # device arg struct memebr + self.arg_member = [] + self.gen_class_name = gen_class_name + self.gen_kernel_name = gen_class_name + "Kernel" + self.template_args = [] + self.__tempalate_arg_list = {'Stages': int, 'SplitKSerial': bool, 'IsBetaZero': bool, 'AlignmentA': int, 'AlignmentB': int} + + self.file_name = output_dir + "/device/" +gen_class_name +".h" + self.sample_dir = output_dir + + + self.cutlass_deps_root = cutlass_deps_root + self.project_root = project_root + self.this_file_root = output_dir + "/device/" + + self.first_use_1stage = False + + ## gen kernel + self.gen_kernel = gen_ker.gen_kernel(self.template_args, self.gen_class_name, self.b2b_num, output_dir, cutlass_deps_root, project_root) + + + def __check_arg_type(self, temp_arg): + if temp_arg in self.__tempalate_arg_list.keys(): + return self.__tempalate_arg_list[temp_arg] + + find_sub = False + for candidate_arg in self.__tempalate_arg_list.keys(): + if (temp_arg.find(candidate_arg) != -1): + return self.__tempalate_arg_list[candidate_arg] + + return 'typename' + + # def gen_B2b2bGemm_class(): + def set_arch(self, sm_cap, mma_tp): + if sm_cap == 75 or sm_cap == 80 or sm_cap == 86: + self.arch = "cutlass::arch::Sm" + str(sm_cap) + + if mma_tp is 'hmma1688': + self.mma_shape = [16, 8, 8] + self.mma_tp = 'hmma' + elif mma_tp is 'imma8816': + self.mma_tp = 'imma' + self.mma_shape = [8, 8, 16] + else: + return 0 + + def gen_include_header(self): + code = '''\ +/* Auto Generated code - Do not edit.*/ + +#pragma once + +#include \"{cutlass_root}cutlass/cutlass.h\" +#include \"{cutlass_root}cutlass/numeric_types.h\" +#include \"{cutlass_root}cutlass/arch/arch.h\" +#include \"{cutlass_root}cutlass/device_kernel.h\" + +#include \"{cutlass_root}cutlass/gemm/threadblock/threadblock_swizzle.h\" + +#include \"{cutlass_root}cutlass/gemm/device/default_gemm_configuration.h\" +#include \"{cutlass_root}cutlass/epilogue/thread/linear_combination_relu.h\" +#include \"{cutlass_root}cutlass/epilogue/thread/linear_combination.h\" + +#include \"{project_root}../kernel/b2b_gemm.h\" +#include \"{project_root}../kernel/default_b2b_gemm.h\" +'''.format(cutlass_root=self.cutlass_deps_root, project_root=self.project_root, this_file_root=self.this_file_root) + include_user_header = "" + for header in self.user_header_file: + include_user_header += "#include \"" + header + "\"\n" + return code + include_user_header + + def gen_code(self, sm_cap, mma_tp, ifprint = True): + self.set_arch(sm_cap, mma_tp) + + self.update_b2b_args() + print(self.fuse_gemm_info) + self.update_b2b_class_template_args() + + func_code = self.gen_all_func() + member_var_code = "private:\n typename B2bGemmKernel::Params params_;\n" + + gen_code = gen_ir.gen_template_class(self.gen_class_name, self.template_args, func_code + member_var_code) + code = self.gen_include_header() + gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("device", gen_code))) + + if ifprint: + print(code) + + print("[INFO]: Gen device code output Dir: is ", self.file_name) 
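        # The assembled device-level header is written to self.file_name below; the matching
        # kernel-level code is then produced by self.gen_kernel (see gen_kernel.gen_code).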
+ with open(self.file_name, 'w+') as f: + f.write(code) + + + gen_kernel = self.gen_kernel.gen_code(self.first_use_1stage) + print(gen_kernel) + + def update_b2b_class_template_args(self): + for arg in self.args.keys(): + self.template_args.append([self.__check_arg_type(arg), arg, self.args[arg]]) + + def update_b2b_args(self): + + self.args['ElementA'] = helper.type_2_cutlass_type(self.fuse_gemm_info[0]['A_tp']) + self.args['LayoutA'] = helper.type_2_cutlass_type(self.fuse_gemm_info[0]['A_format']) + + cnt = 0 + + warp_M_tile = 32 + + # Determine maxmimum N_tile + Max_Ntile = 0 + for layer in self.fuse_gemm_info: + n_tile = layer['mnk'][1] + if n_tile > Max_Ntile: + Max_Ntile = n_tile + if Max_Ntile >= 256: + warp_M_tile = 16 + + stages_temp = [] + + for layer in self.fuse_gemm_info: + cnt_str = str(cnt) + B_tp_str= 'ElementB' + cnt_str + B_format_str = 'LayoutB' + cnt_str + C_tp_str= 'ElementC' + cnt_str + C_format_str = 'LayoutC' + cnt_str + Acc_str = 'ElementAccumulator' + cnt_str + + self.args[B_tp_str] = helper.type_2_cutlass_type(layer['B_tp']) + self.args[B_format_str] = helper.type_2_cutlass_type(layer['B_format']) + self.args[C_tp_str] = helper.type_2_cutlass_type(layer['C_tp']) + self.args[C_format_str] = helper.type_2_cutlass_type(layer['C_format']) + self.args[Acc_str] = helper.type_2_cutlass_type(layer['Acc_tp']) + + + mnk = layer['mnk'][:] + + tile_mnk = mnk[:] + + tile_mnk[2] = 32 # force the ktile is 32 + + #N tile gen + if mnk[1] > 1024: + assert(0) + elif mnk[1] > 512: + tile_mnk[1] = 1024 + elif mnk[1] > 256: + tile_mnk[1] = 512 + elif mnk[1] > 128: + tile_mnk[1] = 256 + elif mnk[1] > 64: + tile_mnk[1] = 128 + elif mnk[1] > 32: + tile_mnk[1] = 64 + else : + tile_mnk[1] = 32 + + if tile_mnk[1] == 512: + stages_temp.append(1) + else: + stages_temp.append(2) + + tile_mnk[0] = 4 * warp_M_tile + + + + epilogue_setted_type = helper.get_epilogue_tp(layer) + cutlass_epilogue_name = "LinearCombinationRelu" + if epilogue_setted_type.lower() == 'leakyrelu': + cutlass_epilogue_name = "LinearCombinationLeakyRelu" + elif epilogue_setted_type.lower() == 'identity': + cutlass_epilogue_name = "LinearCombination" + + epilogue_str = 'EpilogueOutputOp' + cnt_str + if cnt != len(self.fuse_gemm_info) - 1: + n = layer['mnk'][1] + Fragments = tile_mnk[1] // 8 * 2 + self.args[epilogue_str] = "cutlass::epilogue::thread::" + cutlass_epilogue_name + "" + else: + n = layer['mnk'][1] + n_mod_8 = n % 4 + N_align_elements = 1 + if n_mod_8 == 0: + N_align_elements = 8 + elif n_mod_8 == 4: + N_align_elements = 4 + elif n_mod_8 == 2 or n_mod_8 == 6: + N_align_elements = 2 + + self.args[epilogue_str] = "cutlass::epilogue::thread::" + cutlass_epilogue_name+ "" + + + + ThreadBlockShape_str = 'ThreadblockShape' + cnt_str + + self.args[ThreadBlockShape_str] = helper.cvt_2_cutlass_shape(tile_mnk) + + WarpShape_str = 'WarpShape' + cnt_str + tile_mnk[0] = warp_M_tile + self.args[WarpShape_str] = helper.cvt_2_cutlass_shape(tile_mnk) + cnt += 1 + + + self.args['ElementD'] = helper.type_2_cutlass_type(self.fuse_gemm_info[self.b2b_num - 1]['C_tp']) + self.args['LayoutD'] = helper.type_2_cutlass_type(self.fuse_gemm_info[self.b2b_num - 1]['C_format']) + + self.args['InstructionShape'] = helper.cvt_2_cutlass_shape(self.mma_shape) + self.args['OperatorClass'] = 'arch::OpClassTensorOp' + self.args['ArchTag'] = self.arch + self.args['ThreadblockSwizzle'] = 'threadblock::GemmBatchedIdentityThreadblockSwizzle' + + + for i in range(self.b2b_num): + self.args[helper.var_idx('Stages', i)] = "2" + + self.args['AlignmentA'] = 
str(8) + self.args['AlignmentB'] = str(8) + self.args['SplitKSerial'] = 'false' + self.args['Operator'] = 'typename DefaultGemmConfiguration::Operator' + self.args['IsBetaZero'] = 'false' + + + def gen_using_kernel(self): + code = "using B2bGemmKernel = typename kernel::DefaultB2bGemm<\n" + code += " " + "ElementA,\n" + code += " " + "LayoutA,\n" + + for i in range(self.b2b_num): + code += " " + helper.var_idx("ElementB", i) + ",\n" + code += " " + helper.var_idx("LayoutB", i) + ",\n" + code += " " + helper.var_idx("ElementC", i) + ",\n" + code += " " + helper.var_idx("LayoutC", i) + ",\n" + code += " " + helper.var_idx("ElementAccumulator", i) + ",\n" + code += " " + helper.var_idx("EpilogueOutputOp", i) + ",\n" + code += " " + helper.var_idx("ThreadblockShape", i) + ",\n" + code += " " + helper.var_idx("WarpShape", i) + ",\n" + + code += " " + "ElementD,\n" + code += " " + "LayoutD,\n" + code += " " + "InstructionShape,\n" + code += " " + "OperatorClass,\n" + code += " " + "ArchTag,\n" + code += " " + "ThreadblockSwizzle,\n" + + for i in range(self.b2b_num): + code += " " + helper.var_idx("Stages", i) + ",\n" + + + code += " " + "AlignmentA,\n" + code += " " + "AlignmentB,\n" + code += " " + "SplitKSerial,\n" + code += " " + "Operator,\n" + code += " " + "IsBetaZero_\n" + + code += ">::B2bGemmKernel;\n\n" + + return code + + def gen_args(self): + + def gen_arg_member(b2b_num): + data_members = [] + + for i in range(b2b_num): + member_type = "GemmCoord" + member_name = "problem_size_" + str(i) + data_members.append((member_type, member_name)) + + member_type = "TensorRef" + member_name = "ref_A0" + data_members.append((member_type, member_name)) + + for i in range(b2b_num): + member_type = "TensorRef" + member_name = "ref_B" + str(i) + data_members.append((member_type, member_name)) + member_type = "TensorRef" + member_name = "ref_C" + str(i) + data_members.append((member_type, member_name)) + + member_type = "TensorRef" + member_name = helper.var_idx("ref_D", b2b_num - 1) + data_members.append((member_type, member_name)) + + for i in range(b2b_num): + member_type = "typename EpilogueOutputOp" + str(i) + "::Params" + member_name = "epilogue" + str(i) + data_members.append((member_type, member_name)) + + data_members.append(('int', 'batch_count')) + + return data_members + + def gen_arg_struct_default_ctor(struct_name, data_members, inital_param_num, inital_value): + constructs_code = gen_ir.indentation + "CUTLASS_HOST_DEVICE\n" + \ + gen_ir.indentation + struct_name + " (): " + for i in range(inital_param_num): + final_param = ',' + if i == inital_param_num - 1: + final_param = '{ }' + constructs_code += data_members[i][1] + inital_value + final_param + + constructs_code += "\n" + return constructs_code + + def gen_arg_struct_ctor(struct_name, data_members): + constructs_code = gen_ir.indentation + "CUTLASS_HOST_DEVICE\n" + \ + gen_ir.indentation + struct_name + " (\n" + cnt = 0 + param_num = len(data_members) + for param in data_members: + final = ',\n' + if cnt == param_num - 1: + final = '\n):\n' + constructs_code += gen_ir.indentation + param[0] + " " + param[1] + "_" + final + cnt += 1 + + cnt = 0 + for param in data_members: + final = '),\n' + if cnt == param_num - 1: + final = ") { }\n" + constructs_code += gen_ir.indentation + param[1] + "(" + param[1] + "_" + final + cnt += 1 + + constructs_code += "\n" + return constructs_code + + # (variable type, variable name) + struct_member = gen_arg_member(self.b2b_num) + self.arg_member = struct_member + + codeBody = "" + for each_member 
in struct_member: + codeBody += gen_ir.indentation + each_member[0] + " " + each_member[1] + ";\n" + + codeBody += gen_arg_struct_default_ctor("Arguments", struct_member, self.b2b_num, "(0,0,0)") + "\n" + codeBody += gen_arg_struct_ctor("Arguments", struct_member) + "\n" + struct_code = gen_ir.gen_struct("Arguments", codeBody) + return struct_code + + def gen_func_constructs(self): + code = self.gen_class_name +"() {}" + return code + + def gen_func_initialize(self): + code = "Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {\n" + \ + "// Determine grid shape\n" + \ + "ThreadblockSwizzle threadblock_swizzle;\n" + \ + "cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(\n" + \ + " args.problem_size_0, \n" + \ + " { ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK },\n" + \ + " args.batch_count);\n" + \ + "// Initialize the Params structure\n" + \ + "params_ = typename B2bGemmKernel::Params{\n" + for i in range(self.b2b_num): + code += helper.var_idx(" args.problem_size_", i) + ",\n" + code += " grid_shape,\n" + \ + " args.ref_A0.non_const_ref(),\n" + for i in range(self.b2b_num): + code += helper.var_idx(" args.ref_B", i) + ".non_const_ref(),\n" + code += helper.var_idx(" args.ref_C", i) + ".non_const_ref(),\n" + + code += helper.var_idx(" args.ref_D", self.b2b_num - 1) + ",\n" + for i in range(self.b2b_num): + code += helper.var_idx(" args.epilogue", i) + ",\n" + + code += " args.batch_count\n" + code += "};\n" + \ + "return Status::kSuccess;\n" + \ + "}\n" + return code + + def gen_func_run(self): + code = "Status run(cudaStream_t stream = nullptr) {\n" + \ + "\n" + \ + " ThreadblockSwizzle threadblock_swizzle;\n" + \ + "\n" + \ + " dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);\n" + \ + " dim3 block(B2bGemmKernel::kThreadCount, 1, 1);\n" + \ + "\n" + \ + " cudaError_t result;\n" + \ + "\n" + \ + " int smem_size = int(sizeof(typename B2bGemmKernel::SharedStorage));\n" + \ + " if (smem_size >= (48 << 10)) {\n" + \ + " result = cudaFuncSetAttribute(Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);\n" + \ + "\n" + \ + " if (result != cudaSuccess) {\n" + \ + " return Status::kErrorInternal;\n" + \ + " }\n" + \ + " }\n" + \ + " cutlass::Kernel<<>>(params_);\n" + \ + " result = cudaGetLastError();\n" + \ + " return result == cudaSuccess ? 
Status::kSuccess : Status::kErrorInternal;\n" + \ + " }\n" + + return code + def gen_func_operator(self): + opeartor_with_arg_code = "Status operator()(\n" + \ + " Arguments const &args,\n" + \ + " void *workspace = nullptr,\n" + \ + " cudaStream_t stream = nullptr) {\n" + \ + " Status status = initialize(args, workspace);\n" + \ + " \n" + \ + " if (status == Status::kSuccess) {\n" + \ + " status = run(stream);\n" + \ + " }\n" + \ + " return status;\n" + \ + "}\n" + operator_code = "Status operator()(\n" + \ + " cudaStream_t stream = nullptr) {\n" + \ + " Status status = run(stream);\n" + \ + " return status;\n" + \ + "}\n" + return opeartor_with_arg_code + "\n" + operator_code + + def gen_all_func(self): + return self.gen_using_kernel() + "\n" + \ + self.gen_args() + "\n" + \ + self.gen_func_constructs() + "\n" + \ + self.gen_func_initialize() + "\n" + \ + self.gen_func_run() + "\n" + \ + self.gen_func_operator() diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_ir.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_ir.py new file mode 100644 index 0000000000..919c777e42 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_ir.py @@ -0,0 +1,249 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################################# + +import helper + + +indentation = " " + + +def append_word(word): + code = "" + code += word + code += " " + return code + + +def gen_namespace(namespace, codeBody): + code_gen = "namespace " + namespace + " {\n" + code_gen += codeBody + code_gen += "} // namespace " + namespace + "\n" + return code_gen + + +def gen_expression(type, lval, rval = None): + code_gen = "" + code_gen += append_word(type) + code_gen += append_word(lval) + if rval is not None: + code_gen += append_word("=") + code_gen += append_word(rval) + return code_gen + + +def gen_class(name, codeBody, inheritance_code = None): + code_gen = "" + if inheritance_code is None: + code_gen = "class " + name + "{\n" + else: + code_gen = "class " + name + " : "+ inheritance_code + "{\n" + code_gen += codeBody + code_gen += "}; // class " + name + "\n" + return code_gen + + +def gen_struct(name, codeBody, specialized = None): + specialized_code = "" + if specialized is not None: + specialized_code = "<" + specialized + ">" + code_gen = "struct " + name + specialized_code + "{\n" + code_gen += codeBody + code_gen += "}; // struct " + name + "\n" + return code_gen + + +def gen_template_arg(arg_type, arg_name, default_val = None): + rval = None + if default_val is not None: + rval = str(default_val) + + arg_typename = "" + if arg_type is int: + arg_typename = "int" + elif arg_type is bool: + arg_typename = "bool" + else: + arg_typename = "typename" + + internal_arg_name = arg_name + "_" + + code_gen = indentation + code_gen += gen_expression(arg_typename, internal_arg_name, rval) + + return code_gen + + +def gen_template_args(args, set_default = True): + arg_len = len(args) + cnt = 1 + code_gen = "" + for arg_tuple in args: + arg_type = arg_tuple[0] + arg_name = arg_tuple[1] + arg_default_val = None + if len(arg_tuple) == 3 and set_default: + arg_default_val = arg_tuple[2] + + code_gen += gen_template_arg(arg_type, arg_name, arg_default_val) + if cnt != arg_len: + code_gen += ",\n" + cnt += 1 + + return code_gen + + +def gen_template_head(args, set_default = True): + code_gen = "template <\n" + code_gen += gen_template_args(args, set_default) + code_gen += ">\n" + return code_gen + + +def export_template_args(args): + code_gen = "public:\n" + for arg_tuple in args: + code_gen += indentation + arg_type = arg_tuple[0] + arg_name = arg_tuple[1] + internal_arg_name = arg_name + "_" + + typename = "" + if arg_type is int: + typename = "static int const" + elif arg_type is bool: + typename = "static bool const" + else: + typename = "using" + + code_gen += gen_expression(typename, arg_name, internal_arg_name) + code_gen += ";\n" + return code_gen + + +def gen_template_class(class_name, args, codeBody, set_default = True, inheritance_code = None): + code_gen = "" + + code_gen += gen_template_head(args, set_default) + code_gen += gen_class(class_name, export_template_args(args) + codeBody, inheritance_code) + + return code_gen + + +def gen_template_struct(struct_name, args, codeBody, speicalized = None, set_default = True, export_args = True): + code_gen = "" + code_gen += gen_template_head(args, set_default) + code = export_template_args(args) + codeBody + if export_args is False: + code = codeBody + code_gen += gen_struct(struct_name, code , speicalized) + + return code_gen + + +def gen_declare_template_struct(name, *params): + code = name + "<" + cnt = 0 + param_num = len(params) + for param in params: + final = ", " + if cnt == param_num 
- 1: + final = "" + code += param + final + cnt += 1 + code += ">;\n" + return code + + +def filtered_param(params, name_and_value_pair, keep_ = False): + rtn_template_args = [] + speicalized_template_args = [] + + for param in params: + param_name = "" + if len(param) >= 1: + param_name = param[1] + else: + param_name = param[0] + + hit_flag = False + set_value = "" + for n_v_pair in name_and_value_pair: + + filter_name = n_v_pair[0] + set_value = n_v_pair[1] + + if param_name == (filter_name + "_") or param_name == filter_name : + hit_flag = True + break + + + if hit_flag is False: + rtn_template_args.append(param) + + if hit_flag is True: + speicalized_template_args.append(set_value) + else: + if keep_ is True: + speicalized_template_args.append(param_name + "_") + else: + speicalized_template_args.append(param_name) + + + specialized_template_arg_str = helper.list_2_string(speicalized_template_args) + + return rtn_template_args, specialized_template_arg_str + + +def gen_func(func_name, arg_lists, code_body, only_declare = False, with_cudaStream = True): + code = "void " + func_name + "(\n" + for arg in arg_lists: + arg_tp = arg[0] + arg_nm = arg[1] + code += " " + arg_tp + " " + arg_nm + ",\n" + code += "cudaStream_t stream)" + if only_declare : + return code + code += "{\n" + + code += code_body + "\n" + code += "}\n" + return code + + +def indent_level(code, level = 0): + rtn_code = "" + for i in range(level): + rtn_code += " " + + rtn_code += code + + return rtn_code diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py new file mode 100644 index 0000000000..2bbaf26b40 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py @@ -0,0 +1,476 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################################# + +import gen_ir +import helper +import gen_threadblock as gen_tb + + +class gen_default_Gemm: + def __init__(self, template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root): + self.gen_class_name = "B2bGemm" + self.template_param = template_param + self.b2b_num = b2b_num + + self.cutlass_deps_root = cutlass_deps_root + self.project_root = project_root + + def gen_B2bMma(self, specialized_template_args): + code = "using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<\n" + code += specialized_template_args + code += ">::ThreadblockB2bMma;\n" + + # print(code) + return code + + def gen_epilogue(self): + epilogue_code = "" + epilogue_code += helper.var_idx("static const int kPartitionsK", self.b2b_num - 1) + helper.var_idx(" = ThreadblockShape", self.b2b_num - 1) + helper.var_idx("::kK / WarpShape", self.b2b_num - 1) + "::kK;\n" + + epilogue_code += "using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<\n" + epilogue_code += " " + helper.var_idx("ThreadblockShape", self.b2b_num - 1) + ",\n" + epilogue_code += " " + helper.var_idx("typename B2bMma::Operator", self.b2b_num - 1) + ",\n" + epilogue_code += " " + helper.var_idx("kPartitionsK", self.b2b_num - 1) + ",\n" + epilogue_code += " " + helper.var_idx("EpilogueOutputOp", self.b2b_num - 1) + ",\n" + epilogue_code += " " + helper.var_idx("EpilogueOutputOp", self.b2b_num - 1) + "::kCount\n" + epilogue_code += ">::Epilogue;\n" + + epilogue_code += "using B2bGemmKernel = kernel::B2bGemm;\n\n" + + return epilogue_code + + + def gen_include_header(self): + code = ''' +/* Auto Generated code - Do not edit.*/ + +#pragma once +#include \"{cutlass_dir}cutlass/cutlass.h\" + +#include \"{cutlass_dir}cutlass/layout/matrix.h\" +#include \"{cutlass_dir}cutlass/numeric_types.h\" + +#include \"{cutlass_dir}cutlass/epilogue/threadblock/epilogue.h\" +#include \"{cutlass_dir}cutlass/epilogue/thread/linear_combination.h\" + +#include \"{cutlass_dir}cutlass/gemm/gemm.h\" +#include \"{cutlass_dir}cutlass/gemm/kernel/gemm_pipelined.h\" +#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm75.h\" +#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm70.h\" +#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm80.h\" +#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_simt.h\" +#include \"{cutlass_dir}cutlass/gemm/threadblock/threadblock_swizzle.h\" +#include \"{cutlass_dir}cutlass/epilogue/threadblock/default_epilogue_tensor_op.h\" +#include \"{cutlass_dir}cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h\" +#include \"{cutlass_dir}cutlass/epilogue/threadblock/default_epilogue_simt.h\" + +#include \"{cutlass_dir}cutlass/transform/threadblock/predicated_tile_iterator.h\" + +#include \"../kernel/b2b_gemm.h\" +#include \"../threadblock/default_b2b_mma.h\" +'''.format(cutlass_dir=self.cutlass_deps_root) + return code + + def gen_code(self): + gen_using = '' + # Generate default template struct + gen_code = gen_ir.gen_template_struct("Default" + self.gen_class_name, self.template_param,"", speicalized = None, set_default=False) + + + filter_list = [] + filter_list.append(('Stages', 2)) + filter_list.append(("OperatorClass", "arch::OpClassTensorOp")) + filter_list.append(("ArchTag", "arch::Sm75")) + + for i in range(self.b2b_num): + filter_list.append((helper.var_idx("LayoutC", i), "layout::RowMajor")) + + + rtn_template_args, 
speicalized_template_args = gen_ir.filtered_param(self.template_param, filter_list, keep_= True) + + + B2bMma_code = self.gen_B2bMma(speicalized_template_args) + epilogue_and_rest_code = self.gen_epilogue() + + gen_special_code = gen_ir.gen_template_struct("Default" + self.gen_class_name, rtn_template_args, B2bMma_code + epilogue_and_rest_code, speicalized = speicalized_template_args, set_default=False) + + code = gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("kernel", gen_code + gen_special_code))) + + return self.gen_include_header() + code + + +class gen_Kernel: + def __init__(self, template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root): + self.gen_class_name = "B2bGemm" + self.template_param = template_param + self.b2bnum = b2b_num + + self.cutlass_deps_root = cutlass_deps_root + self.project_root = project_root + + def gen_include_header(self): + code = ''' +#pragma once + +#include \"{cutlass_dir}cutlass/cutlass.h\" +#include \"{cutlass_dir}cutlass/gemm/gemm.h\" +#include \"{cutlass_dir}cutlass/matrix_coord.h\"\n'''.format(cutlass_dir=self.cutlass_deps_root) + return code + + def gen_Params(self): + gen_param = "" + for i in range(self.b2bnum): + gen_param += " " + helper.var_idx("cutlass::gemm::GemmCoord problem_size_", i) + ";\n" + gen_param += " " + "cutlass::gemm::GemmCoord grid_tiled_shape;\n" + gen_param += " " + "typename B2bMma::IteratorA0::Params params_A0;\n" + gen_param += " " + "typename B2bMma::IteratorA0::TensorRef ref_A0;\n" + + for i in range(self.b2bnum): + gen_param += " " + helper.var_idx("typename B2bMma::IteratorB", i) + helper.var_idx("::Params params_B", i) + ";\n" + gen_param += " " + helper.var_idx("typename B2bMma::IteratorB", i) + helper.var_idx("::TensorRef ref_B", i) + ";\n" + if i == self.b2bnum - 1: + gen_param += " " + helper.var_idx("typename Epilogue::OutputTileIterator::Params params_C", i) + ";\n" + gen_param += " " + helper.var_idx("typename Epilogue::OutputTileIterator::TensorRef ref_C", i) + ";\n" + + else: + gen_param += " " + helper.var_idx("typename FusedAddBiasEpilogue", i) + helper.var_idx("::OutputTileIterator::Params params_C", i) + ";\n" + gen_param += " " + helper.var_idx("typename FusedAddBiasEpilogue", i) + helper.var_idx("::OutputTileIterator::TensorRef ref_C", i) + ";\n" + + + + + gen_param += " " + helper.var_idx("typename Epilogue::OutputTileIterator::Params params_D", self.b2bnum - 1) + ";\n" + gen_param += " " + helper.var_idx("typename Epilogue::OutputTileIterator::TensorRef ref_D", self.b2bnum - 1) + ";\n" + + for i in range(self.b2bnum): + gen_param += " " + helper.var_idx("typename OutputOp", i) + helper.var_idx("::Params output_op_", i) + ";\n" + + gen_param += " " + 'int batch_count' + ";\n" + gen_param += " " + 'int gemm_k_iterations_0' + ";\n" + + + return gen_param + + def gen_Memberfunc(self): + code_default = "\nCUTLASS_HOST_DEVICE\n" + code_default += "Params()" + + code_default += " { } \n\n" + + code_construct = "\nCUTLASS_HOST_DEVICE\n" + code_construct += "Params(\n" + + for i in range(self.b2bnum): + code_construct += " " + helper.var_idx("cutlass::gemm::GemmCoord const & problem_size_", i) + ",\n" + + code_construct += " " + "cutlass::gemm::GemmCoord const & grid_tiled_shape,\n" + + code_construct += " " + "typename B2bMma::IteratorA0::TensorRef ref_A0,\n" + + for i in range(self.b2bnum): + code_construct += " " + helper.var_idx("typename B2bMma::IteratorB", i) + helper.var_idx("::TensorRef ref_B", i) + ",\n" + if i == self.b2bnum - 1: + code_construct += " 
" + helper.var_idx("typename Epilogue::OutputTileIterator::TensorRef ref_C", i) + ",\n" + else: + code_construct += " " + helper.var_idx("typename FusedAddBiasEpilogue", i) + helper.var_idx("::OutputTileIterator::TensorRef ref_C", i) + ",\n" + + code_construct += " " + helper.var_idx("typename Epilogue::OutputTileIterator::TensorRef ref_D", self.b2bnum - 1) + ",\n" + for i in range(self.b2bnum): + code_construct += " " + helper.var_idx("typename OutputOp", i) + helper.var_idx("::Params output_op_", i) + helper.var_idx(" = typename OutputOp", i) + "::Params(),\n" + + code_construct += " " + "int batch_count = 1\n" + + code_construct += "):\n" + + for i in range(self.b2bnum): + code_construct += " " + helper.var_idx("problem_size_", i) + helper.var_idx("(problem_size_", i) + "),\n" + + code_construct += " " + "grid_tiled_shape(grid_tiled_shape),\n" + code_construct += " " + "params_A0(ref_A0.layout()),\n" + code_construct += " " + "ref_A0(ref_A0),\n" + + for i in range(self.b2bnum): + code_construct += " " + helper.var_idx("params_B", i) + helper.var_idx("(ref_B", i) + ".layout()),\n" + code_construct += " " + helper.var_idx("ref_B", i) + helper.var_idx("(ref_B", i) + "),\n" + code_construct += " " + helper.var_idx("params_C", i) + helper.var_idx("(ref_C", i) + ".layout()),\n" + code_construct += " " + helper.var_idx("ref_C", i) + helper.var_idx("(ref_C", i) + "),\n" + + code_construct += " " + helper.var_idx("params_D", self.b2bnum - 1) + helper.var_idx("(ref_D", self.b2bnum - 1) + ".layout()),\n" + code_construct += " " + helper.var_idx("ref_D", self.b2bnum - 1) + helper.var_idx("(ref_D", self.b2bnum - 1) + "),\n" + + for i in range(self.b2bnum): + code_construct += " " + helper.var_idx("output_op_", i) + helper.var_idx("(output_op_", i) + "), \n" + + code_construct += " " + "batch_count(batch_count) {\n" + code_construct += " " + helper.var_idx("gemm_k_iterations_", 0) + helper.var_idx(" = (problem_size_", 0) + helper.var_idx(".k() + B2bMma::Shape", 0) + helper.var_idx("::kK - 1) / B2bMma::Shape", 0) + "::kK;\n" + + code_construct += "}\n" + + return code_default + code_construct + + def gen_using(self): + code_using = "" + + for i in range(self.b2bnum - 1): + code_using += " " + helper.var_idx("using OutputOp", i) + helper.var_idx(" = typename B2bMma::OutputOp", i) + ";\n" + + code_using += " " + helper.var_idx("using OutputOp", self.b2bnum - 1) + " = typename Epilogue::OutputOp;\n" + + for i in range(self.b2bnum - 1): + code_using += " " + helper.var_idx("using FusedAddBiasEpilogue", i) + helper.var_idx(" = typename B2bMma::FusedAddBiasEpilogue", i) +";\n" + + + code_using += " " + "using WarpCount0 = typename B2bMma::WarpCount0;\n" + code_using += " " + "static int const kThreadCount = 32 * WarpCount0::kCount;\n" + + code_using += gen_ir.gen_struct("Params", self.gen_Params() + self.gen_Memberfunc()) + + code_using += "union SharedStorage {\n" + code_using += " " + "typename B2bMma::B2bMmaSharedStorage main_loop;\n" + code_using += " " + "typename Epilogue::SharedStorage epilogue;\n" + code_using += "};\n" + + return code_using + + def gen_can_implement(self): + gen_code = "" + return gen_code + + def gen_operator_and_constr(self): + ctr_code = "CUTLASS_HOST_DEVICE\n" + ctr_code += self.gen_class_name + "() { } \n\n" + operator_code = "CUTLASS_DEVICE\n" + operator_code += "void operator()(Params const ¶ms, SharedStorage &shared_storage) {\n" + operator_code += " " + "ThreadblockSwizzle threadblock_swizzle;\n" + operator_code += " " + "cutlass::gemm::GemmCoord threadblock_tile_offset = 
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);\n" + operator_code += " " + "int batch_idx = threadblock_tile_offset.k();\n" + operator_code += " " + "if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||\n" + operator_code += " " + "params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {\n" + operator_code += " " + " " + "return;\n" + operator_code += " " + "}\n" + + operator_code += " " + "cutlass::MatrixCoord tb_offset_A0{\n" + operator_code += " " + " " + "threadblock_tile_offset.m() * B2bMma::Shape0::kM,\n" + operator_code += " " + " " + "0\n" + operator_code += " " + "};\n" + + for i in range(self.b2bnum): + operator_code += " " + helper.var_idx("cutlass::MatrixCoord tb_offset_B", i) + "{\n" + operator_code += " " + " " + "0,\n" + operator_code += " " + " " + helper.var_idx("threadblock_tile_offset.n() * B2bMma::Shape", i) + "::kN\n" + operator_code += " " + "};\n" + + operator_code += " " + "int thread_idx = threadIdx.x;\n\n" + + operator_code += " " + "MatrixCoord threadblock_offset(\n" + operator_code += " " + " " + helper.var_idx("threadblock_tile_offset.m() * B2bMma::Shape", self.b2bnum - 1) + "::kM,\n" + operator_code += " " + " " + helper.var_idx("threadblock_tile_offset.n() * B2bMma::Shape", self.b2bnum - 1) + "::kN\n" + operator_code += " " + ");\n" + + operator_code += " " + "typename B2bMma::IteratorA0 iterator_A0(\n" + operator_code += " " + " " + "params.params_A0,\n" + operator_code += " " + " " + "params.ref_A0.data(),\n" + operator_code += " " + " " + "params.problem_size_0.mk(),\n" + operator_code += " " + " " + "thread_idx,\n" + operator_code += " " + " " + "tb_offset_A0);\n" + + operator_code += " " + "iterator_A0.add_pointer_offset(batch_idx * params.problem_size_0.m() * params.problem_size_0.k());\n\n" + + + for i in range (self.b2bnum): + operator_code += " " + helper.var_idx("typename B2bMma::IteratorB", i ) + helper.var_idx(" iterator_B", i) + "(\n" + operator_code += " " + " " + helper.var_idx("params.params_B", i) + ",\n" + operator_code += " " + " " + helper.var_idx("params.ref_B", i) + ".data(),\n" + operator_code += " " + " " + helper.var_idx("params.problem_size_", i) + ".kn(),\n" + operator_code += " " + " " + "thread_idx,\n" + operator_code += " " + " " + helper.var_idx("tb_offset_B", i) + ");\n" + operator_code += " " + helper.var_idx("iterator_B", i) + helper.var_idx(".add_pointer_offset(batch_idx * params.problem_size_", i) + helper.var_idx(".n() * params.problem_size_", i) + ".k());\n\n" + + + for i in range (self.b2bnum - 1): + operator_code += " " + helper.var_idx("typename FusedAddBiasEpilogue", i ) + helper.var_idx("::OutputTileIterator iterator_C", i) + "(\n" + operator_code += " " + " " + helper.var_idx("params.params_C", i) + ",\n" + operator_code += " " + " " + helper.var_idx("params.ref_C", i) + ".data(),\n" + operator_code += " " + " " + helper.var_idx("params.problem_size_" , i) + ".mn(),\n" + operator_code += " " + " " + "thread_idx,\n" + operator_code += " " + " " + "threadblock_offset" + ");\n" + operator_code += " " + helper.var_idx("int ref_C", i) + helper.var_idx("_stride = params.ref_C", i) + ".stride()[0];\n" + operator_code += " " + helper.var_idx("iterator_C", i) + helper.var_idx(".add_pointer_offset(batch_idx * params.problem_size_", i) + helper.var_idx(".n() * (ref_C", i) + helper.var_idx("_stride == 0 ? 
1 : params.problem_size_", i) + ".m()));\n\n" + + + for i in range (self.b2bnum - 1): + operator_code += " " + helper.var_idx("FusedAddBiasEpilogue", i ) + helper.var_idx(" epilogue_", i ) + ";\n" + + + operator_code += " " + "int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);\n" + operator_code += " " + "int lane_idx = threadIdx.x % 32;\n" + + for i in range (self.b2bnum - 1): + operator_code += " " + helper.var_idx("OutputOp", i) + helper.var_idx(" output_op_", i) + helper.var_idx("(params.output_op_", i) + ");\n" + + operator_code += " " + "B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);\n" + + operator_code += " " + "typename B2bMma::FragmentC0 src_accum;\n" + operator_code += " " + helper.var_idx("typename B2bMma::FragmentC", self.b2bnum - 1)+ " accumulators;\n" + + operator_code += " " + "src_accum.clear();\n" + operator_code += " " + "accumulators.clear();\n" + operator_code += " " + "b2bMma(params.gemm_k_iterations_0, accumulators, iterator_A0, " + + for i in range(self.b2bnum): + operator_code += helper.var_idx("iterator_B", i) + ", " + + operator_code += "src_accum" + if self.b2bnum != 1: + operator_code += ", " + for i in range(self.b2bnum - 1): + operator_code += helper.var_idx("output_op_", i) + ", " + + for i in range(self.b2bnum - 1): + operator_code += helper.var_idx("epilogue_", i) + ", " + + for i in range(self.b2bnum - 1): + final = ", " + if i == self.b2bnum - 2: + final ="" + operator_code += helper.var_idx("iterator_C", i) + final + operator_code += ");\n" + + operator_code += " " + helper.var_idx("OutputOp", self.b2bnum - 1) + helper.var_idx(" output_op_", self.b2bnum - 1) + helper.var_idx("(params.output_op_", self.b2bnum - 1) + ");\n" + operator_code += " " + "threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);\n" + + + + operator_code += " " + helper.var_idx("typename Epilogue::OutputTileIterator iterator_C", self.b2bnum - 1) + "(\n" + operator_code += " " + " " + helper.var_idx("params.params_C", self.b2bnum - 1) + ",\n" + operator_code += " " + " " + helper.var_idx("params.ref_C", self.b2bnum - 1) + ".data(),\n" + operator_code += " " + " " + helper.var_idx("params.problem_size_", self.b2bnum - 1) + ".mn(),\n" + operator_code += " " + " " + "thread_idx,\n" + operator_code += " " + " " + "threadblock_offset\n" + operator_code += " " + ");\n" + operator_code += " " + helper.var_idx("int ref_C", self.b2bnum - 1) + helper.var_idx("_stride = params.ref_C", self.b2bnum - 1) + ".stride()[0];\n" + + operator_code += " " + helper.var_idx("iterator_C", self.b2bnum - 1) + helper.var_idx(".add_pointer_offset(batch_idx * params.problem_size_", self.b2bnum - 1) + helper.var_idx(".n() * (ref_C", self.b2bnum - 1) + helper.var_idx("_stride == 0 ? 
1 : params.problem_size_", self.b2bnum - 1) + ".m()));\n\n" + + operator_code += " " + helper.var_idx("typename Epilogue::OutputTileIterator iterator_D", self.b2bnum - 1) + "(\n" + operator_code += " " + " " + helper.var_idx("params.params_D", self.b2bnum - 1) + ",\n" + operator_code += " " + " " + helper.var_idx("params.ref_D", self.b2bnum - 1) + ".data(),\n" + operator_code += " " + " " + helper.var_idx("params.problem_size_", self.b2bnum - 1) + ".mn(),\n" + operator_code += " " + " " + "thread_idx,\n" + operator_code += " " + " " + "threadblock_offset\n" + operator_code += " " + ");\n" + operator_code += " " + helper.var_idx("iterator_D", self.b2bnum - 1) + helper.var_idx(".add_pointer_offset(batch_idx * params.problem_size_", self.b2bnum - 1) + helper.var_idx(".n() * params.problem_size_", self.b2bnum - 1) + ".m());\n\n" + + + operator_code += " " + "Epilogue epilogue(\n" + operator_code += " " + " " + "shared_storage.epilogue,\n" + operator_code += " " + " " + "thread_idx,\n" + operator_code += " " + " " + "warp_idx,\n" + operator_code += " " + " " + "lane_idx\n" + operator_code += " " + ");\n" + + operator_code += " " + "epilogue(" + operator_code += helper.var_idx("output_op_", self.b2bnum - 1) + ", " + operator_code += helper.var_idx("iterator_D", self.b2bnum - 1) + ", " + operator_code += "accumulators, " + operator_code += helper.var_idx("iterator_C", self.b2bnum - 1) + ");\n" + operator_code += "}\n" + + return ctr_code + operator_code + + def gen_include_header(self): + code = ''' +#pragma once + +#include \"{cutlass_dir}cutlass/cutlass.h\" + +#include \"{cutlass_dir}cutlass/gemm/gemm.h\" +#include \"{cutlass_dir}cutlass/matrix_coord.h\" +#include \"{cutlass_dir}cutlass/semaphore.h\" +'''.format(cutlass_dir=self.cutlass_deps_root) + return code + def gen_code(self): + + template_param = [] + template_param.append(("typename", "B2bMma")) + template_param.append(("typename", "Epilogue")) + template_param.append(("typename", "ThreadblockSwizzle")) + template_param.append((bool, "SplitKSerial")) + + code_body = "" + code_body += self.gen_using() + code_body += self.gen_operator_and_constr() + + struct_code = gen_ir.gen_template_struct(self.gen_class_name, template_param, code_body) + code = self.gen_include_header() + code += gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("kernel", struct_code))) + + return self.gen_include_header() + code + + + +class gen_kernel: + def __init__(self, template_param, gen_class_name, b2b_num, output_dir, cutlass_deps_root, project_root): + self.template_param = template_param + + self.gen_class_name = "B2bGemm" + self.gen_kernel_name = gen_class_name + "Kernel" + self.template_args = [] + + self.cutlass_deps_root = cutlass_deps_root + self.project_root = project_root + + self.gen_default_b2b_gemm = gen_default_Gemm(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) + self.gen_Kerenl = gen_Kernel(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) + + # Include gen_threadBlock + self.gen_threadBlock = gen_tb.gen_threadblock(template_param, gen_class_name, b2b_num, output_dir, cutlass_deps_root, project_root) + + self.file_dir = output_dir + "/kernel/" + + def gen_code(self, first_use_1stage): + + default_b2b_gemm = self.gen_default_b2b_gemm.gen_code() + + print("[INFO]: Gen kernel code [default_b2b_gemm.h]output Dir: is ", self.file_dir) + + with open(self.file_dir + "default_b2b_gemm.h", "w+") as f: + f.write(default_b2b_gemm) + + kernel = self.gen_Kerenl.gen_code() + 
print("[INFO]: Gen kernel code [b2b_gemm.h]output Dir: is ", self.file_dir) + + with open(self.file_dir + "b2b_gemm.h", "w+") as f: + f.write(kernel) + + # Call code to gen threadblock + self.gen_threadBlock.gen_code(first_use_1stage) diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_sample.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_sample.py new file mode 100644 index 0000000000..6474d95c5d --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_sample.py @@ -0,0 +1,232 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################################# + +import helper +import gen_ir as ir + +class gen_test: + def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, output_dir = "../"): + self.fuse_gemm_info = fuse_gemm_info + self.gen_class_name = gen_class_name + self.user_header_file = user_header_file + self.sample_dir = output_dir + self.b2b_num = len(fuse_gemm_info) + + def gen_cpp_sample(self): + code = "/* Auto Generated code - Do not edit.*/\n" + code += "#include \n" + + code += "#include \"cutlass/gemm/device/gemm_batched.h\" \n" + code += "#include \"cutlass/cutlass.h\" \n" + + code += "#include \"../cutlass_irrelevant.h\" \n" + code += "#include \"../cutlass_verify.h\" \n" + + code += "#include \"leaky_bias.h\" \n" + + code += "#include \"utils.h\" \n" + + + + code += "int main(int args, char * argv[]) {\n" + code += " " + "int M = atoi(argv[1]);\n" + code += " " + "int K0 = " + str(self.fuse_gemm_info[0]['mnk'][0]) + ";\n" + code += " " + "if(args == 3);\n" + code += " " + " " + "K0 = atoi(argv[2]);\n" + code += " " + "int B = 1;\n" + code += " " + "if(args == 4);\n" + code += " " + " " + "B = atoi(argv[3]);\n" + + code += " " + "srand(1234UL);\n" + code += " " + "int device_id = 0;\n" + code += " " + "cudaGetDevice(&device_id);\n" + code += " " + "cudaDeviceProp prop;\n" + code += " " + "cudaGetDeviceProperties(&prop, device_id);\n" + code += " " + "int sm = prop.major *10 + prop.minor;\n" + code += "using ElementCompute = cutlass::half_t;\n" + + for i in range(self.b2b_num): + code += " " + helper.var_idx("ElementCompute alpha", i) + " = ElementCompute(1);\n" + addbias = helper.get_epilogue_add_bias_or_not( self.fuse_gemm_info[i]) + if addbias: + code += " " + helper.var_idx("ElementCompute beta", i) + " = ElementCompute(1);\n" + else: + code += " " + helper.var_idx("ElementCompute beta", i) + " = ElementCompute(0);\n" + + code += " " + "size_t flops = 0;\n" + + for i in range(self.b2b_num): + m = self.fuse_gemm_info[i]['mnk'][0] + n = self.fuse_gemm_info[i]['mnk'][1] + k = self.fuse_gemm_info[i]['mnk'][2] + + bias_shape = helper.get_epilogue_bias_shape(self.fuse_gemm_info[i]) + + this_k = "K0" + if (i > 0): + this_k = str(k) + + code += " " + "flops += size_t(2) * size_t(M) * size_t(B) * " + "size_t(" + str(n) + ") * size_t(" + this_k + ");\n" + + code += " " + helper.var_idx("cutlass::gemm::GemmCoord problem_size_", i) + "(" + "M" + ", " + str(n) + ", " + this_k + ");\n" + + code += " " + helper.var_idx("memory_unit Mat_A", i) + helper.var_idx("(B * problem_size_", i) + helper.var_idx(".m() * problem_size_", i) + ".k());\n" + code += " " + helper.var_idx("memory_unit Mat_B", i) + helper.var_idx("(B * problem_size_", i) + helper.var_idx(".n() * problem_size_", i) + ".k());\n" + code += " " + helper.var_idx("memory_unit Mat_C", i) + "(B * " + str(bias_shape[0]) + " * " + str(bias_shape[1]) + ");\n" + code += " " + helper.var_idx("memory_unit Mat_D_cutlass_ref", i) + helper.var_idx("(B * problem_size_", i) + helper.var_idx(".m() * problem_size_", i) + ".n());\n" + + code += " " + helper.var_idx("Mat_A", i) + ".init();\n" + code += " " + helper.var_idx("Mat_B", i) + ".init();\n" + code += " " + helper.var_idx("Mat_C", i) + ".init();\n" + + + + code += " " + helper.var_idx("memory_unit Mat_D", self.b2b_num - 1) + helper.var_idx("(B * problem_size_", i) + helper.var_idx(".m() * problem_size_",self.b2b_num - 1) + ".n());\n" + + params = [] + params.append("M") + params.append("B") + + 
params.append("Mat_A0.device_ptr") + for i in range(self.b2b_num): + params.append(helper.var_idx("Mat_B", i) + ".device_ptr") + params.append(helper.var_idx("Mat_C", i) + ".device_ptr") + if i != self.b2b_num-1: + params.append(helper.var_idx("Mat_D_cutlass_ref", i) + ".device_ptr") + params.append(helper.var_idx("Mat_D", self.b2b_num - 1) + ".device_ptr") + + code += " " + "Param arguments = {\n" + code += " " + " " + "M,\n" + code += " " + " " + "K0,\n" + code += " " + " " + "B,\n" + + code += " " + " " + "reinterpret_cast(Mat_A0.device_ptr),\n" + cnt = 1 + for i in range(self.b2b_num): + bias_flag = helper.get_epilogue_add_bias_or_not( self.fuse_gemm_info[i]) + code += " " + " " + "reinterpret_cast(" + helper.var_idx("Mat_B", i) + ".device_ptr" + "),\n" + cnt += 1 + if bias_flag: + code += " " + " " + "reinterpret_cast(" + helper.var_idx("Mat_C", i) + ".device_ptr" + "),\n" + cnt += 1 + else: + code += " " + " " + "reinterpret_cast(NULL),\n" + + epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i]) + acc_tp = helper.get_epilogue_compute_tp(self.fuse_gemm_info[i]) + for arg in epilogue_args: + arg_value = str(arg[2]) + + code += " " + " " + helper.type_2_cutlass_type(acc_tp) + "(" + arg_value + "),\n" + + if i != self.b2b_num - 1: + code += " " + " " + "reinterpret_cast(" + helper.var_idx("Mat_D_cutlass_ref", i) + ".device_ptr" + "),\n" + else: + code += " " + " " + "reinterpret_cast(" + helper.var_idx("Mat_D", i) + ".device_ptr" + ")};\n" + + + + + code += " " + "TI(FUSED_CUTLASS);\n" + code += " " + "for(int i = 0; i < 100; i++){\n" + code += " " + " " + "one_api(arguments, sm, NULL);\n" + + code += " " + "}\n" + code += " " + "TO(FUSED_CUTLASS, \"FUSED_CUTLASS\", 100);\n" + + code += "\n" + + for i in range(self.b2b_num): + code_this = "" + + N_str = str(self.fuse_gemm_info[i]['mnk'][1]) + + code_this += " " + helper.var_idx("typename Gemm", i) + helper.var_idx("::Arguments arguments_", i) + "{\n" + code_this += " " + " " + helper.var_idx("problem_size_", i) + ",\n" + ldmA = str(self.fuse_gemm_info[i]['mnk'][2]) + if i == 0: + ldmA = "K0" + ldmB = str(self.fuse_gemm_info[i]['mnk'][2]) + if i == 0: + ldmB = "K0" + ldmC = str(self.fuse_gemm_info[i]['mnk'][1]) + + ldmBias = str(helper.get_epilogue_bias_ldm(self.fuse_gemm_info[i])) + + if self.fuse_gemm_info[i]['A_format'] is 'Col': + ldmA = "M" + if self.fuse_gemm_info[i]['B_format'] is 'Row': + ldmB = str(self.fuse_gemm_info[i]['mnk'][1]) + if self.fuse_gemm_info[i]['C_format'] is 'Col': + ldmC = "M" + + if i == 0: + code_this += " " + " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + "*>(" + helper.var_idx("Mat_A", i) + ".device_ptr), " + ldmA + "}, " + "M * " + ldmA + ",\n" + else: + code_this += " " + " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + "*>(" + helper.var_idx("Mat_D_cutlass_ref", i - 1) + ".device_ptr), " + ldmA + "}, " + "M * " + ldmA + ",\n" + + code_this += " " + " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['B_tp']) + "*>(" + helper.var_idx("Mat_B", i) + ".device_ptr), " + ldmB + "}, " + N_str + " * " + ldmB + ",\n" + + M_bias = str(helper.get_epilogue_bias_shape(self.fuse_gemm_info[i])[0]) + + code_this += " " + " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("Mat_C", i) + ".device_ptr), " + ldmBias + "}, " + M_bias + " * " + N_str + ",\n" + code_this += " " + " " + "{reinterpret_cast<" + 
helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("Mat_D_cutlass_ref", i) + ".device_ptr), " + ldmC + "}, " + "M * " + ldmC + ",\n" + code_this += " " + " " + "{ " + helper.var_idx("alpha", i) + ", " + helper.var_idx("beta", i) + for epilogue_arg in helper.get_epilogue_args(self.fuse_gemm_info[i]): + arg_value = str(epilogue_arg[2]) + code_this += ", " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + "(" + str(arg_value) + ")" + code_this += " " + " },\n" + code_this += " " + " " + "B};\n" + + code += code_this + + + + code += " " + "TI(UNFUSED_CUTLASS);\n" + code += " " + "for(int i = 0; i < 100; i++){\n" + code += " " + " " + self.gen_class_name + "_verify(\n" + for i in range(self.b2b_num): + code += " " + " " + " " + helper.var_idx("arguments_", i) + ",\n" + code += " " + " " + " " + "NULL);\n" + + code += " " + "}\n" + code += " " + "TO(UNFUSED_CUTLASS, \"UNFUSED_CUTLASS\", 100);\n" + + code += " " + helper.var_idx("Mat_D_cutlass_ref", self.b2b_num - 1) + ".d2h();\n" + code += " " + helper.var_idx("Mat_D", self.b2b_num - 1) + ".d2h();\n" + code += " " + helper.var_idx("check_result(Mat_D_cutlass_ref", self.b2b_num - 1) + helper.var_idx(".host_ptr, Mat_D", self.b2b_num - 1) \ + + helper.var_idx(".host_ptr, Mat_D", self.b2b_num - 1) + ".elements);\n" + + code += "\n\n}\n" + + with open(self.sample_dir + "sample.cu", "w+") as f: + f.write(code) diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py new file mode 100644 index 0000000000..91f9ef3ccb --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py @@ -0,0 +1,1013 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################################# + +import gen_ir +import helper + + +class gen_default_b2b_mma: + def __init__(self, template_param, gen_class_name, b2b_num,cutlass_deps_root, project_root): + self.gen_class_name = "DefaultB2bMma" + self.template_param = template_param + self.b2b_num = b2b_num + + self.cutlass_deps_root = cutlass_deps_root + self.project_root = project_root + + def gen_include_header(self): + code = ''' +/* Auto Generated code - Do not edit.*/ + +#pragma once + +#include \"{cutlass_dir}cutlass/cutlass.h\" +#include \"{cutlass_dir}cutlass/numeric_types.h\" +#include \"{cutlass_dir}cutlass/arch/arch.h\" + +#include \"{cutlass_dir}cutlass/transform/threadblock/predicated_tile_iterator.h\" +#include \"{cutlass_dir}cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h\" +#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm70.h\" +#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm75.h\" +#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm80.h\" + +#include \"../threadblock/b2b_mma_pipelined.h\" +#include \"../../fixed_impl/epilogue/threadblock/fused_bias_act_epilogue.h\" +#include \"../../fixed_impl/epilogue/threadblock/default_bias_act_epilogue_tensor_op.h\" +#include \"../../fixed_impl/gemm/warp/mma_tensor_op_fragment_iterator_without_output_op.h\" +'''.format(cutlass_dir=self.cutlass_deps_root) + return code + + + def gen_using_MmaCore(self, stage): + threadBlockShape = "ThreadblockShape" + warpShape = "WarpShape" + instrunctionShape = "InstructionShape" + Mma_typename = "typename cutlass::gemm::threadblock::DefaultMmaCore" + + + gen_code = "" + + for i in range(self.b2b_num): + code_using = "using MmaCore" + str(i) + gen_code += code_using + " = " + gen_ir.gen_declare_template_struct(Mma_typename, \ + helper.var_idx(threadBlockShape, i), helper.var_idx(warpShape, i), instrunctionShape, \ + "ElementA", "LayoutA", \ + helper.var_idx("ElementB", i), helper.var_idx("LayoutB", i), \ + helper.var_idx("ElementAccumulator", i), "layout::RowMajor", \ + "OperatorClass", str(stage), "Operator") + return gen_code + + def gen_using_FusedAddBiasEpilogue(self): + gen_code = "" + for i in range(self.b2b_num - 1): + code_using = helper.var_idx("using FusedAddBiasEpilogue", i) + epilogue_name = "typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp" + template_args = helper.var_idx("::Epilogue" + + gen_code += code_using + " = " + epilogue_name + template_args + ";\n" + + return gen_code + + + def gen_using_Iterator(self): + code_using = "using IteratorA0" + iterator_typename = "cutlass::transform::threadblock::PredicatedTileIterator" + MmaCore = "MmaCore0" + matrix_shape = "cutlass::MatrixShape<" + MmaCore + "::Shape::kM, " + MmaCore + "::Shape::kK>" + iterator_map = "typename " + MmaCore + "::IteratorThreadMapA" + gen_code = code_using + " = " + gen_ir.gen_declare_template_struct(iterator_typename, \ + matrix_shape, "ElementA", "LayoutA", "1", iterator_map, "AlignmentA_") + + for i in range(self.b2b_num): + code_using = "using IteratorB" + str(i) + iterator_typename = "cutlass::transform::threadblock::PredicatedTileIterator" + MmaCore = "MmaCore" + str(i) + matrix_shape = "cutlass::MatrixShape<" + MmaCore + "::Shape::kK, " + MmaCore + "::Shape::kN>" + iterator_map = "typename " + MmaCore + "::IteratorThreadMapB" + + gen_code += code_using + " = " + gen_ir.gen_declare_template_struct(iterator_typename, \ + matrix_shape, 
helper.var_idx("ElementB", i), helper.var_idx("LayoutB", i), "0", iterator_map, "AlignmentB_") + + return gen_code + + def gen_fragment_iterator(self): + gen_code = "using AccumulatorLayout = cutlass::layout::ColumnMajor;\n" + + for i in range(1, self.b2b_num): + code_using = "using FragmentIteratorA" + str(i) + iterator_typename = "cutlass::gemm::warp::MmaTensorOpPureFragmentIterator" + curr_MmaCore = "MmaCore" + str(i) + prev_MmaCore = "MmaCore" + str(i - 1) + Matrix_shape_curr = "cutlass::MatrixShape<" + curr_MmaCore + "::WarpShape::kM, " + curr_MmaCore + "::InstructionShape::kK>" + Matrix_shape_prev = "cutlass::MatrixShape<" + prev_MmaCore + "::WarpShape::kM, " + prev_MmaCore + "::WarpShape::kN>" + Curr_shape_kK = curr_MmaCore + "::Shape::kK" + + gen_code += code_using + " = " + gen_ir.gen_declare_template_struct(iterator_typename, \ + Matrix_shape_curr, Matrix_shape_prev, Curr_shape_kK, \ + helper.var_idx("ElementAccumulator", i-1), "ElementA", \ + "AccumulatorLayout", "InstructionShape_", "true") + + return gen_code + + def gen_threadblockmma(self): + code_using = "using ThreadblockB2bMma" + iterator_typename = "cutlass::gemm::threadblock::B2bMmaPipelined" + + MmaPipelined_param_Mma0_shape = "typename MmaCore0::Shape" + MmaPipelined_param_Mma0_iteratorA = "IteratorA0" + MmaPipelined_param_Mma0_smemIteratorA = "typename MmaCore0::SmemIteratorA" + MmaPipelined_param_Mma0_iteratorB = "IteratorB0" + MmaPipelined_param_Mma0_smemIteratorB = "typename MmaCore0::SmemIteratorB" + + MmaPipelined_param_list = MmaPipelined_param_Mma0_shape + ", " + MmaPipelined_param_Mma0_iteratorA + ", " + MmaPipelined_param_Mma0_smemIteratorA + ", " + MmaPipelined_param_Mma0_iteratorB + ", " + MmaPipelined_param_Mma0_smemIteratorB + ", " + + for i in range(1, self.b2b_num): + MmaPipelined_param_Mma_shape = "typename MmaCore" + str(i) + "::Shape" + MmaPipelined_param_Mma_iteratorA = "FragmentIteratorA" + str(i) + MmaPipelined_param_Mma_iteratorB = "IteratorB" + str(i) + MmaPipelined_param_Mma_smemIteratorB = "typename MmaCore" + str(i) + "::SmemIteratorB" + + MmaPipelined_param_list += MmaPipelined_param_Mma_shape + ", " + MmaPipelined_param_Mma_iteratorA + ", " + MmaPipelined_param_Mma_iteratorB + ", " + MmaPipelined_param_Mma_smemIteratorB + ", " + + MmaPipelined_param_list += "ElementAccumulator0, layout::RowMajor, " + + for i in range(self.b2b_num - 1): + epilogue_name = "EpilogueOutputOp" + str(i) + MmaPipelined_param_list += epilogue_name + ", " + + for i in range(self.b2b_num - 1): + epilogue_name = "FusedAddBiasEpilogue" + str(i) + MmaPipelined_param_list += epilogue_name + ", " + + for i in range(self.b2b_num): + MmaPolicy = "typename MmaCore" + str(i) + "::MmaPolicy" + MmaPipelined_param_list += MmaPolicy + ", " + + + cnt = 0 + for i in range(self.b2b_num): + MmaStage = helper.var_idx("Stages", i) + final = ", " + if cnt == self.b2b_num - 1: + final = "" + MmaPipelined_param_list += MmaStage + final + cnt += 1 + + gen_code = code_using + " = " + gen_ir.gen_declare_template_struct(iterator_typename, MmaPipelined_param_list) + + return gen_code + + + + def gen_code(self): + gen_using = '' + # Generate default template struct + gen_code = gen_ir.gen_template_struct(self.gen_class_name, self.template_param, "", speicalized = None, set_default=False) + + # Generate specialized template struct + + mmacore_codebody = self.gen_using_MmaCore(2) + iterator_codebody = self.gen_using_Iterator() + fragment_iterator_codebody = self.gen_fragment_iterator() + epilogue_iterator_codebody = 
self.gen_using_FusedAddBiasEpilogue() + threadBlockMma = self.gen_threadblockmma() + specialized_code = mmacore_codebody + iterator_codebody + fragment_iterator_codebody + epilogue_iterator_codebody + threadBlockMma + + # Specialize layout C -> cutlass::layout::RowMajor + + rtn_template_args, speicalized_template_args = gen_ir.filtered_param(self.template_param, [ ('LayoutD', "cutlass::layout::RowMajor")], keep_= True) + + gen_speical_code = gen_ir.gen_template_struct(self.gen_class_name, rtn_template_args, specialized_code, speicalized = speicalized_template_args, set_default=False) + code = gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("threadblock", gen_code + gen_speical_code))) + + return self.gen_include_header() + code + + +class gen_b2b_mme_pipelined: + def __init__(self, template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root): + self.gen_class_name = "B2bMmaPipelined" + self.template_param = template_param + self.b2b_num = b2b_num + self.cutlass_deps_root = cutlass_deps_root + self.project_root = project_root + + + def gen_include_header(self): + code = ''' +#pragma once + +#include \"{cutlass_dir}cutlass/cutlass.h\" +#include \"{cutlass_dir}cutlass/array.h\" +#include \"{cutlass_dir}cutlass/aligned_buffer.h\" +#include \"{cutlass_dir}cutlass/numeric_conversion.h\" + +#include \"{cutlass_dir}cutlass/numeric_types.h\" +#include \"{cutlass_dir}cutlass/matrix_shape.h\" + +#include \"{cutlass_dir}cutlass/gemm/gemm.h\" +#include \"{cutlass_dir}cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h\" + +#include \"../threadblock/b2b_mma_base.h\"\n'''.format(cutlass_dir = self.cutlass_deps_root) + return code + + + def gen_using(self): + code_using = "using FragmentA0 = typename IteratorA0::Fragment;\n" + + code_using += "using Base = B2bMmaBase<" + for i in range(self.b2b_num): + code_using += helper.var_idx("Shape", i) + "_, " + for i in range(self.b2b_num): + code_using += helper.var_idx("Policy", i) + "_, " + for i in range(self.b2b_num): + code_using += helper.var_idx("Stage", i) + "_, " + code_using = code_using[: -2] + ">;\n" + + + for i in range(self.b2b_num): + code_using += helper.var_idx("using FragmentB", i) + helper.var_idx(" = typename IteratorB", i) + "::Fragment;\n" + code_using += helper.var_idx("using FragmentC", i) + helper.var_idx(" = typename Policy", i) + "::Operator::FragmentC;\n" + code_using += helper.var_idx("using Operator", i) + helper.var_idx(" = typename Policy", i) + "::Operator;\n" + + for i in range(self.b2b_num - 1): + code_using += helper.var_idx("using IteratorC", i) + helper.var_idx(" = typename FusedAddBiasEpilogue", i) + "::OutputTileIterator;\n" + + code_using += "using ArchTag = typename Policy0::Operator::ArchTag;\n" + code_using += "static ComplexTransform const kTransformA0 = Operator0::kTransformA;\n" + + for i in range(self.b2b_num): + code_using += helper.var_idx("static ComplexTransform const kTransformB", i) + helper.var_idx(" = Operator", i) + "::kTransformB;\n" + + code_using += "private:\n" + code_using += "using WarpFragmentA0 = typename Operator0::FragmentA;\n" + code_using += "using WarpFragmentB0 = typename Operator0::FragmentB;\n" + + for i in range(1, self.b2b_num): + code_using += helper.var_idx("using WarpFragmentA", i) + helper.var_idx(" = typename FragmentIteratorA", i) + "::Fragment;\n" + code_using += helper.var_idx("using WarpFragmentB", i) + helper.var_idx(" = typename Operator", i) + "::FragmentB;\n" + + code_using += "protected:\n" + + code_using += "SmemIteratorA0 
smem_iterator_A_;\n" + + for i in range(self.b2b_num): + code_using += helper.var_idx("SmemIteratorB", i) + helper.var_idx(" smem_iterator_B", i) + "_;\n" + + return code_using + + + def gen_operator(self, first_use_1stage = False): + code = "" + def gen_operator_param(b2b_num): + param_code = "" + param_code += "int gemm_k_iterations_0,\n" + param_code += helper.var_idx("FragmentC", b2b_num-1) + helper.var_idx(" &accum", b2b_num-1) + ",\n" + param_code += "IteratorA0 iterator_A,\n" + + for i in range(b2b_num): + param_code += helper.var_idx("IteratorB", i) + " " + helper.var_idx("iterator_B", i) + ",\n" + + param_code += "FragmentC0 const &src_accum, \n" + + for i in range(b2b_num - 1): + param_code += helper.var_idx("OutputOp", i) + " " + helper.var_idx("output_op_", i) + ",\n" + for i in range(b2b_num - 1): + param_code += helper.var_idx("FusedAddBiasEpilogue", i) + " " + helper.var_idx("epilogue_", i) + ",\n" + for i in range(b2b_num - 1): + param_code += helper.var_idx("IteratorC", i) + " " + helper.var_idx("iterator_C", i) + ",\n" + + + param_code += "TransformA0 transform_A0 = TransformA0(), \n" + + for i in range(b2b_num): + final = "(),\n" + if i == b2b_num - 1: + final = "()\n" + param_code += helper.var_idx("TransformB", i) + " " + helper.var_idx("transform_B", i) + " = " +helper.var_idx("TransformB", i) + final + + return param_code + + + + def gen_first_gemm_1stage(b2b_num): + accu_code = " FragmentC0 accum0 = src_accum;\n" + if b2b_num == 1: + accu_code = " accum0 = src_accum;\n" + + code ="\ +\n\ + FragmentA0 tb_frag_A;\n\ + FragmentB0 tb_frag_B0;\n\ +\n\ + int smem_write_stage_idx = 1;\n\ +\n\ + tb_frag_A.clear();\n\ + tb_frag_B0.clear();\n\ +\n\ + // The last kblock is loaded in the prolog\n\ + iterator_A.load(tb_frag_A);\n\ + iterator_B0.load(tb_frag_B0);\n\ +\n\ + ++iterator_A;\n\ + ++iterator_B0;\n\ +\n\ + WarpFragmentA0 warp_frag_A0;\n\ + WarpFragmentB0 warp_frag_B0;\n\ +\n\ + Operator0 warp_mma0;\n\ +\n\ + // Avoid reading out of bounds\n\ + if (gemm_k_iterations_0 <= 1) {\n\ + iterator_A.clear_mask();\n\ + iterator_B0.clear_mask();\n\ + }\n\ +\n\ + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\ + // shared memory loads (which have the tightest latency requirement).\n\ +\n\ + //\n\ + // Mainloop\n\ + //\n\ +\n\ + // Note: The main loop does not support Base::WarpGemmIterations == 2.\n\ + CUTLASS_GEMM_LOOP\n\ + for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {\n\ +\n\ + this->smem_iterator_A_.store(tb_frag_A);\n\ + this->smem_iterator_B0_.store(tb_frag_B0);\n\ +\n\ + __syncthreads();\n\ + //\n\ + // Loop over GEMM K dimension\n\ + //\n\ +\n\ + CUTLASS_PRAGMA_UNROLL\n\ + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {\n\ +\n\ + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group\n\ + // as the case may be.\n\ +\n\ + this->warp_tile_iterator_A0_.set_kgroup_index(warp_mma_k % Base::kWarpGemmIterations0);\n\ + this->warp_tile_iterator_B0_.set_kgroup_index(warp_mma_k % Base::kWarpGemmIterations0);\n\ +\n\ + this->warp_tile_iterator_A0_.load(warp_frag_A0);\n\ + this->warp_tile_iterator_B0_.load(warp_frag_B0);\n\ +\n\ + ++this->warp_tile_iterator_A0_;\n\ + ++this->warp_tile_iterator_B0_;\n\ +\n\ + warp_mma0(accum0, warp_frag_A0, warp_frag_B0, accum0);\n\ + }\n\ + this->warp_tile_iterator_A0_.add_tile_offset({0, -Policy0::kPartitionsK * Base::kWarpGemmIterations0});\n\ + this->warp_tile_iterator_B0_.add_tile_offset({-Policy0::kPartitionsK * 
Base::kWarpGemmIterations0, 0});\n\ +\n\ + __syncthreads();\n\ + iterator_A.load(tb_frag_A);\n\ + iterator_B0.load(tb_frag_B0);\n\ +\n\ + ++iterator_A;\n\ + ++iterator_B0;\n\ +\n\ + if(gemm_k_iterations_0 <= 2) {\n\ + iterator_A.clear_mask();\n\ + iterator_B0.clear_mask();\n\ + }\n\ + }\n" + + return accu_code + code + + + def gen_first_gemm_2stage(b2b_num): + + accu_code = " FragmentC0 accum0 = src_accum;\n" + if b2b_num == 1: + accu_code = " accum0 = src_accum;\n" + + code ="\ +\n\ + FragmentA0 tb_frag_A;\n\ + FragmentB0 tb_frag_B0;\n\ +\n\ + tb_frag_A.clear();\n\ + tb_frag_B0.clear();\n\ +\n\ + // The last kblock is loaded in the prolog\n\ + iterator_A.load(tb_frag_A);\n\ + iterator_B0.load(tb_frag_B0);\n\ +\n\ + ++iterator_A;\n\ + ++iterator_B0;\n\ +\n\ + this->smem_iterator_A_.store(tb_frag_A);\n\ + this->smem_iterator_B0_.store(tb_frag_B0);\n\ +\n\ + ++this->smem_iterator_A_;\n\ + ++this->smem_iterator_B0_;\n\ +\n\ + __syncthreads();\n\ +\n\ + // Pair of fragments used to overlap shared memory loads and math instructions\n\ + WarpFragmentA0 warp_frag_A0[2];\n\ + WarpFragmentB0 warp_frag_B0[2];\n\ +\n\ + this->warp_tile_iterator_A0_.set_kgroup_index(0);\n\ + this->warp_tile_iterator_B0_.set_kgroup_index(0);\n\ +\n\ + this->warp_tile_iterator_A0_.load(warp_frag_A0[0]);\n\ + this->warp_tile_iterator_B0_.load(warp_frag_B0[0]);\n\ +\n\ + ++this->warp_tile_iterator_A0_;\n\ + ++this->warp_tile_iterator_B0_;\n\ +\n\ + Operator0 warp_mma0;\n\ +\n\ + int smem_write_stage_idx = 1;\n\ +\n\ + // Avoid reading out of bounds\n\ + if (gemm_k_iterations_0 <= 1) {\n\ + iterator_A.clear_mask();\n\ + iterator_B0.clear_mask();\n\ + }\n\ +\n\ + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\ + // shared memory loads (which have the tightest latency requirement).\n\ + iterator_A.load(tb_frag_A);\n\ +\n\ + //\n\ + // Mainloop\n\ + //\n\ +\n\ + // Note: The main loop does not support Base::WarpGemmIterations == 2.\n\ + CUTLASS_GEMM_LOOP\n\ + for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {\n\ +\n\ + //\n\ + // Loop over GEMM K dimension\n\ + //\n\ +\n\ + CUTLASS_PRAGMA_UNROLL\n\ + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {\n\ +\n\ + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group\n\ + // as the case may be.\n\ +\n\ + if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {\n\ +\n\ + // Write fragments to shared memory\n\ + this->smem_iterator_A_.store(tb_frag_A);\n\ +\n\ + this->smem_iterator_B0_.store(tb_frag_B0);\n\ +\n\ + __syncthreads();\n\ +\n\ + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\ + // shared memory loads (which have the tightest latency requirement).\n\ + iterator_A.load(tb_frag_A);\n\ + \n\ + ++this->smem_iterator_B0_;\n\ + ++this->smem_iterator_A_;\n\ + \n\ +\n\ + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory\n\ + if (smem_write_stage_idx == 1) {\n\ + this->smem_iterator_A_.add_tile_offset({0, -Base::Stage0});\n\ + this->smem_iterator_B0_.add_tile_offset({-Base::Stage0, 0});\n\ + }\n\ + else {\n\ + this->warp_tile_iterator_A0_.add_tile_offset(\n\ + {0, -Base::Stage0 * Policy0::kPartitionsK * Base::kWarpGemmIterations0});\n\ + this->warp_tile_iterator_B0_.add_tile_offset(\n\ + {-Base::Stage0 * Policy0::kPartitionsK * Base::kWarpGemmIterations0,\n\ + 0});\n\ + }\n\ +\n\ + smem_write_stage_idx ^= 1;\n\ + }\n\ +\n\ + this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % 
Base::kWarpGemmIterations0);\n\ + this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);\n\ + \n\ + this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]);\n\ + this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]);\n\ +\n\ + ++this->warp_tile_iterator_A0_;\n\ + ++this->warp_tile_iterator_B0_;\n\ +\n\ + if (warp_mma_k == 0) {\n\ +\n\ + iterator_B0.load(tb_frag_B0);\n\ +\n\ + ++iterator_A;\n\ + ++iterator_B0;\n\ +\n\ + // Avoid reading out of bounds if this was the last loop iteration\n\ + if (gemm_k_iterations_0 <= 2) {\n\ + iterator_A.clear_mask();\n\ + iterator_B0.clear_mask();\n\ + }\n\ + }\n\ +\n\ + warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2], warp_frag_B0[warp_mma_k % 2], accum0);\n\ + }\n\ + }\n" + return accu_code + code + + def gen_other_gemms_2stage(b2b_num): + + code = "" + + def gemm_teamplate(id): + code = "// " + str(id + 1) + " Gemm" + code += " /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile\n" + + code += " " + helper.var_idx("FragmentC", id - 1) + helper.var_idx(" after_epilogue_accu", id - 1) + ";\n" + code += " " + helper.var_idx("epilogue_", id - 1) + helper.var_idx("(output_op_", id - 1) + helper.var_idx(", accum", id - 1) \ + + helper.var_idx(", after_epilogue_accu", id - 1) + helper.var_idx(", iterator_C", id - 1) +");\n" + + # FragmentIteratorA1 warp_tile_iterator_A1_(accum0); + code += " " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx(" warp_tile_iterator_A", id) +"_(" + helper.var_idx("after_epilogue_accu", id - 1) + ");\n" + # FragmentB1 tb_frag_B1; + code += " " + helper.var_idx("FragmentB", id) + " " + helper.var_idx("tb_frag_B", id) + ";\n" + # tb_frag_B1.clear(); + code += " " + helper.var_idx("tb_frag_B", id) + ".clear();\n" + # iterator_B1.load(tb_frag_B1); + code += " " + helper.var_idx("iterator_B", id) + ".load(" + helper.var_idx("tb_frag_B", id) + ");\n" + # ++iterator_B1; + code += " " + "++" + helper.var_idx("iterator_B", id) + ";\n" + # this->smem_iterator_B1_.store(tb_frag_B1); + code += " " + helper.var_idx("this->smem_iterator_B", id) + "_.store(" + helper.var_idx("tb_frag_B", id) + ");\n" + # ++this->smem_iterator_B1_; + code += " " + helper.var_idx("++this->smem_iterator_B", id) + "_;\n" + # __syncthreads(); + code += " " + "__syncthreads();\n" + # WarpFragmentA1 warp_frag_A1[2]; + code += " " + helper.var_idx("WarpFragmentA", id) + helper.var_idx(" warp_frag_A", id) + "[2];\n" + # WarpFragmentB1 warp_frag_B1[2]; + code += " " + helper.var_idx("WarpFragmentB", id) + helper.var_idx(" warp_frag_B", id) + "[2];\n" + # this->warp_tile_iterator_B1_.set_kgroup_index(0); + code += " " + helper.var_idx("this->warp_tile_iterator_B", id) + "_.set_kgroup_index(0);\n" + # warp_tile_iterator_A1_.load(warp_frag_A1[0], output_op_0); + code += " " + helper.var_idx("warp_tile_iterator_A", id) + helper.var_idx("_.load(warp_frag_A", id) + "[0]);\n" + # this->warp_tile_iterator_B1_.load(warp_frag_B1[0]); + code += " " + helper.var_idx("this->warp_tile_iterator_B", id) + helper.var_idx("_.load(warp_frag_B", id) + "[0]);\n" + # ++warp_tile_iterator_A1_; + code += " " + helper.var_idx("++warp_tile_iterator_A", id) + "_;\n" + # ++this->warp_tile_iterator_B1_; + code += " " + helper.var_idx("++this->warp_tile_iterator_B", id) + "_;\n" + # Operator1 warp_mma1; + code += " " + helper.var_idx("Operator", id) + " " + helper.var_idx("warp_mma", id) + ";\n" + # smem_write_stage_idx = 1; + code += " " + "smem_write_stage_idx = 1;\n" + # int 
gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1; + code += " " + helper.var_idx("int gemm_k_iterations_", id) + " = " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx("::Policy::kIterations / Base::kWarpGemmIterations", id) +";\n" + # if (gemm_k_iterations_1 <= 1) { + # iterator_B1.clear_mask(); + # } + code += " " + "if (" + helper.var_idx("gemm_k_iterations_", id) + " <= 1 ){\n" \ + + " " + " " + helper.var_idx("iterator_B", id) + ".clear_mask();\n" \ + + " " +"}\n" + # CUTLASS_PRAGMA_UNROLL + code += " " + "CUTLASS_PRAGMA_UNROLL\n" + # for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) { + code += " " + helper.var_idx("for (; gemm_k_iterations_", id) + helper.var_idx(" > 0; --gemm_k_iterations_", id) + ") {\n" + # CUTLASS_PRAGMA_UNROLL + code += " " + " " + "CUTLASS_PRAGMA_UNROLL\n" + # for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) { + code += " " + " " + helper.var_idx("for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations", id) + "; ++warp_mma_k) {\n" + # if (warp_mma_k == Base::kWarpGemmIterations1 - 1) { + code += " " + " " + " " + helper.var_idx("if (warp_mma_k == Base::kWarpGemmIterations", id) + " - 1) {\n" + # this->smem_iterator_B1_.store(tb_frag_B1); + code += " " + " " + " " + " " + helper.var_idx(" this->smem_iterator_B", id) + helper.var_idx("_.store(tb_frag_B", id) + ");\n" + # __syncthreads(); + code += " " + " " + " " + " " + "__syncthreads();\n" + # ++smem_iterator_B1_; + code += " " + " " + " " + " " + helper.var_idx(" ++smem_iterator_B", id) + "_;\n" + # if (smem_write_stage_idx == 1) { + # smem_iterator_B1_.add_tile_offset({-Base::Stage, 0}); + # } + code += " " + " " + " " + " " + "if ( smem_write_stage_idx == 1 ) {\n" \ + + " " + " " + " " + " " + " " + helper.var_idx("smem_iterator_B", id) + helper.var_idx("_.add_tile_offset({-Base::Stage", i) + ", 0});\n" \ + + " " + " " + " " + " " +"}\n" + # else { + # this->warp_tile_iterator_B1_.add_tile_offset( + # {-Base::Stage * Policy1::kPartitionsK * + # Base::kWarpGemmIterations1, + # 0}); + # } + code += " " + " " + " " + " " + "else {\n" \ + + " " + " " + " " + " " + " " + helper.var_idx("this->warp_tile_iterator_B", id) + "_.add_tile_offset(\n" \ + + " " + " " + " " + " " + " " + helper.var_idx("{-Base::Stage", id) + helper.var_idx(" * Policy", id) + "::kPartitionsK *\n" \ + + " " + " " + " " + " " + " " + helper.var_idx("Base::kWarpGemmIterations", id) + ",\n" \ + + " " + " " + " " + " " + " " + "0});\n" \ + + " " + " " + " " + " " + "}\n" + + # smem_write_stage_idx ^= 1; + # } + code += " " + " " + " " + " " + "smem_write_stage_idx ^= 1;\n" \ + + " " + " " + " " + "}\n" + + # this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1); + code += " " + " " + " " + helper.var_idx("this->warp_tile_iterator_B", id) + helper.var_idx("_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations", id) + ");\n" + # warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2], output_op_0); + code += " " + " " + " " + helper.var_idx("warp_tile_iterator_A", id) + helper.var_idx("_.load(warp_frag_A", id) + "[(warp_mma_k + 1) % 2]);\n" + # this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]); + code += " " + " " + " " + helper.var_idx("this->warp_tile_iterator_B", id) + helper.var_idx("_.load(warp_frag_B", id) + "[(warp_mma_k + 1) % 2]);\n" + # ++warp_tile_iterator_A1_; + code += " " + " " + " " + helper.var_idx("++warp_tile_iterator_A", id) + "_;\n" + # 
++this->warp_tile_iterator_B1_; + code += " " + " " + " " + helper.var_idx("++this->warp_tile_iterator_B", id) + "_;\n" + # if (warp_mma_k == 0) { + # iterator_B1.load(tb_frag_B1); + # ++iterator_B1; + # if (gemm_k_iterations_1 <= 2) { + # iterator_B1.clear_mask(); + # } + # } + code += " " + " " + " " + " if (warp_mma_k == 0) {\n" \ + + " " + " " + " " + " " + helper.var_idx("iterator_B", id) + helper.var_idx(".load(tb_frag_B", id) + ");\n" \ + + " " + " " + " " + " " + helper.var_idx("++iterator_B", id) +";\n" \ + + " " + " " + " " + " " + helper.var_idx("if (gemm_k_iterations_", id) +" <= 2) {\n" \ + + " " + " " + " " + " " + " " + helper.var_idx("iterator_B", id) + ".clear_mask();\n" \ + + " " + " " + " " + " " + "}\n" \ + + " " + " " + " " + "}\n" + # warp_mma1(accum, warp_frag_A1[warp_mma_k % 2], warp_frag_B1[warp_mma_k % 2], accum); + # } + # } + code += " " + " " + " " + helper.var_idx("warp_mma", id) + helper.var_idx("(accum", id) + helper.var_idx(", warp_frag_A", id) + helper.var_idx("[warp_mma_k % 2], warp_frag_B", id) + helper.var_idx("[warp_mma_k % 2], accum", id) + ");\n" \ + + " " + " " + "}\n" \ + + " " + "}\n\n\n" + + return code + + for i in range (1, b2b_num): + clear_accu = "" + if i != b2b_num - 1: + clear_accu = " " + helper.var_idx("FragmentC", i) + helper.var_idx(" accum", i) +";\n" + clear_accu += " " + helper.var_idx("accum", i) +".clear();\n" + code += clear_accu + gemm_teamplate(i) + + return code + + operator_code = " CUTLASS_DEVICE\n\ + void operator()(\n " + gen_operator_param(self.b2b_num) + ") {\n" + if first_use_1stage: + operator_code += gen_first_gemm_1stage(self.b2b_num) + else: + operator_code += gen_first_gemm_2stage(self.b2b_num) + operator_code += gen_other_gemms_2stage(self.b2b_num) + "}\n" + return operator_code + + def gen_construct_func(self): + name = self.gen_class_name + func_code = "CUTLASS_DEVICE\n" + func_code += name + "(\n" \ + + " " + "typename Base::B2bMmaSharedStorage &shared_storage,\n" \ + + " " + "int thread_idx,\n" \ + + " " + "int warp_idx,\n" \ + + " " + "int lane_idx\n" \ + + "):\n" + func_code += " " + "Base(shared_storage, thread_idx, warp_idx, lane_idx),\n" \ + + " " + "smem_iterator_A_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx),\n" + + for i in range(self.b2b_num): + final = ",\n" + if i == self.b2b_num - 1: + final = " {\n" + func_code += helper.var_idx("smem_iterator_B", i) + helper.var_idx("_(shared_storage.sharedStorage", i) +".operand_B_ref(), thread_idx)" + final + + func_code += " " + "int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);\n" + func_code += " " + "int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);\n" + + func_code += " " + "int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;\n" + func_code += " " + "int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;\n" + + for i in range(self.b2b_num): + func_code += " " + helper.var_idx("int tile_offset_k", i) + helper.var_idx(" = Base::kWarpGemmIterations", i) + " * warp_idx_k;\n" + + func_code += " " + "this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, tile_offset_k0});\n" + + for i in range(self.b2b_num): + func_code += " " + helper.var_idx("this->warp_tile_iterator_B", i) + helper.var_idx("_.add_tile_offset({tile_offset_k", i) + ", warp_idx_n});\n" + + func_code += "}\n" + + return func_code + + def gen_member_func(self, first_use_1stage): + code = "public:\n" + code += self.gen_operator(first_use_1stage) + code += self.gen_construct_func() + + return code + + def gen_code(self, 
first_use_1stage): + + def gen_template_args(b2b_num): + template_param = [] + template_param.append(("typename", "Shape0")) + template_param.append(("typename", "IteratorA0")) + template_param.append(("typename", "SmemIteratorA0")) + template_param.append(("typename", "IteratorB0")) + template_param.append(("typename", "SmemIteratorB0")) + + for i in range(1, b2b_num): + template_param.append(("typename", helper.var_idx("Shape", i))) + template_param.append(("typename", helper.var_idx("FragmentIteratorA", i))) + template_param.append(("typename", helper.var_idx("IteratorB", i))) + template_param.append(("typename", helper.var_idx("SmemIteratorB", i))) + + template_param.append(("typename", "ElementC")) + template_param.append(("typename", "LayoutC")) + + for i in range(0, b2b_num - 1): + template_param.append(("typename", helper.var_idx("OutputOp", i))) + + for i in range(0, b2b_num - 1): + template_param.append(("typename", helper.var_idx("FusedAddBiasEpilogue", i))) + + for i in range(0, b2b_num): + template_param.append(("typename", helper.var_idx("Policy", i))) + for i in range(0, b2b_num): + template_param.append((int, helper.var_idx("Stage", i))) + + template_param.append(("typename","TransformA0", "NumericArrayConverter")) + + for i in range(0, b2b_num): + cvtr = helper.var_idx("NumericArrayConverter" + template_param.append(("typename", helper.var_idx("TransformB", i), cvtr)) + + template_param.append(("typename", "Enable", "bool")) + + return template_param + + template_param = gen_template_args(self.b2b_num) + inheritance_code = "public B2bMmaBase<" + for i in range(self.b2b_num): + inheritance_code += helper.var_idx("Shape", i) + "_, " + for i in range(self.b2b_num): + inheritance_code += helper.var_idx("Policy", i) + "_, " + for i in range(self.b2b_num - 1): + inheritance_code += helper.var_idx("Stage", i) + "_, " + inheritance_code += helper.var_idx("Stage", self.b2b_num - 1) + "_" + inheritance_code += ">" + + code_body = "" + using_code= self.gen_using() + func_code = self.gen_member_func(first_use_1stage) + + code_body = using_code + func_code + + class_code = gen_ir.gen_template_class(self.gen_class_name, template_param, code_body, inheritance_code = inheritance_code) + + code = self.gen_include_header() + code += gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("threadblock", class_code))) + # print(code) + return code + + +class gen_b2b_mma_base: + def __init__(self, template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root): + self.gen_class_name = gen_class_name + self.template_param = template_param + self.b2b_num = b2b_num + self.cutlass_deps_root = cutlass_deps_root + self.project_root = project_root + + def gen_include_header(self): + code = ''' +#pragma once + +#include \"{cutlass_dirs}cutlass/aligned_buffer.h\" +#include \"{cutlass_dirs}cutlass/arch/memory.h\" +#include \"{cutlass_dirs}cutlass/array.h\" +#include \"{cutlass_dirs}cutlass/cutlass.h\" +#include \"{cutlass_dirs}cutlass/gemm/gemm.h\" +#include \"{cutlass_dirs}cutlass/matrix_shape.h\" +#include \"{cutlass_dirs}cutlass/numeric_types.h\"\n'''.format(cutlass_dirs=self.cutlass_deps_root) + return code + + def gen_shared_storage(self): + code = \ +" template< \n\ + typename Shape_,\n\ + typename Policy_,\n\ + int ThisStage_\n\ +>\n\ +class SharedStorage {\n\ +public:\n\ + using Shape = Shape_;\n\ + using Policy = Policy_;\n\ + static int const ThisStage = ThisStage_;\n\ + using Operator = typename Policy::Operator;\n\ + \ + using TensorRefA = TensorRef;\n\ + 
\ + /// Tensor reference to the B operand \n\ + using TensorRefB = TensorRef;\n\ +\n\ + /// Shape of the A matrix operand in shared memory \n\ + using ShapeA = MatrixShape;\n\ +\n\ + /// Shape of the B matrix operand in shared memory\n\ + using ShapeB =\n\ + MatrixShape;\n\ +\n\ + public:\n\ +\n\ + /// Buffer for A operand\n\ + AlignedBuffer operand_A;\n\ +\n\ + /// Buffer for B operand\n\ + AlignedBuffer operand_B;\n\ +\n\ + public:\n\ +\n\ + /// Returns a layout object for the A matrix\n\ + CUTLASS_DEVICE\n\ + static typename Operator::LayoutA LayoutA() {\n\ + return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});\n\ + }\n\ +\n\ + /// Returns a layout object for the B matrix\n\ + CUTLASS_HOST_DEVICE\n\ + static typename Operator::LayoutB LayoutB() {\n\ + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});\n\ + }\n\ +\n\ + /// Returns a TensorRef to the A operand\n\ + CUTLASS_HOST_DEVICE\n\ + TensorRefA operand_A_ref() {\n\ + return TensorRefA{operand_A.data(), LayoutA()};\n\ + }\n\ +\n\ + /// Returns a TensorRef to the B operand\n\ + CUTLASS_HOST_DEVICE\n\ + TensorRefB operand_B_ref() {\n\ + return TensorRefB{operand_B.data(), LayoutB()};\n\ + }\n\ + CUTLASS_HOST_DEVICE\n\ + void * get_B_Shared_ptr() {\n\ + return operand_B.data();\n\ + }\n\ + };\n" + return code + + def gen_using_and_misc(self, b2b_num): + code_using = "" + for i in range(b2b_num): + code_using += "using Operator" +str(i) + " = typename Policy" + str(i) +"::Operator;\n" + + for i in range(b2b_num): + code_using += "using WarpGemm" +str(i) + " = typename Policy" + str(i) +"::Operator::Shape;\n" + + for i in range(b2b_num): + code_using += "using WarpCount" +str(i) + " = GemmShape<" + helper.var_idx("Shape", i) +"::kM / " + helper.var_idx("WarpGemm", i) +"::kM, "\ + + helper.var_idx("Shape", i) +"::kN / " + helper.var_idx("WarpGemm", i) +"::kN, "\ + + helper.var_idx("Shape", i) +"::kK / " + helper.var_idx("WarpGemm", i) +"::kK>;\n" + + code_misc = "" + for i in range(b2b_num): + code_misc += "static int const " + helper.var_idx("kWarpGemmIterations", i) + " = (" + helper.var_idx("WarpGemm", i) + "::kK / " + helper.var_idx("Operator", i) +"::Policy::MmaShape::kK);\n" + + code = code_using + code_misc + self.gen_shared_storage() + + for i in range(b2b_num): + code += "using " + helper.var_idx("SharedStorage", i) + " = SharedStorage<" + helper.var_idx("Shape", i) + ", " + helper.var_idx("Policy", i) +", " + helper.var_idx("Stage", i) + ">;\n" + + def gen_union_shared_storage(b2b_num): + code = "" + for i in range(b2b_num): + code += " " +helper.var_idx("SharedStorage", i) + " " + helper.var_idx("sharedStorage", i) +";\n" + return code + + code += "union B2bMmaSharedStorage {\n" + gen_union_shared_storage(self.b2b_num) + "};\n" + + for i in range(b2b_num - 1): + code += helper.var_idx("void * C", i) + "_smm_ptr;\n" + + return code + + def gen_protected(self): + code = "\nprotected:\n" + code += "typename Operator0::IteratorA warp_tile_iterator_A0_;\n" + for i in range(self.b2b_num): + code += "typename Operator" +str(i) + "::IteratorB" +" warp_tile_iterator_B" + str(i) + "_;\n" + return code + + def gen_public_member(self): + code = "\npublic:\n" + + code += "CUTLASS_DEVICE\n" + code += \ + "B2bMmaBase(\n" + \ + " B2bMmaSharedStorage & shared_storage,\n" + \ + " int thread_idx,\n" + \ + " int warp_idx,\n" + \ + " int lane_idx\n" + \ + "):\n" + \ + " warp_tile_iterator_A0_(shared_storage.sharedStorage0.operand_A_ref(), lane_idx),\n" + for i in range(self.b2b_num): + final = ",\n" + if i == 
self.b2b_num-1: + final = "\n" + + iterator = " warp_tile_iterator_B" + str(i) + "_" + shared_storage = "shared_storage.sharedStorage" + str(i) + ".operand_B_ref()" + code += iterator + "(" + shared_storage + ", lane_idx)" + final + + + code += "{\n" + for i in range(self.b2b_num - 1): + code += helper.var_idx(" C", i) + helper.var_idx("_smm_ptr = shared_storage.sharedStorage", i) + ".get_B_Shared_ptr();\n" + code += "}\n" + + return code + + def gen_code(self): + + template_arg = [] + for i in range(self.b2b_num): + template_arg.append(("typename", helper.var_idx("Shape", i))) + for i in range(self.b2b_num): + template_arg.append(("typename", helper.var_idx("Policy", i))) + for i in range(self.b2b_num): + template_arg.append((int, helper.var_idx("Stage", i))) + + + + code_body = self.gen_using_and_misc(self.b2b_num) + code_body += self.gen_protected() + code_body += self.gen_public_member() + + class_code = gen_ir.gen_template_class("B2bMmaBase", template_arg, code_body) + + code = self.gen_include_header() + gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("threadblock", class_code))) + + return code + + +class gen_threadblock: + def __init__(self, template_param, gen_class_name, b2b_num, output_dir, cutlass_deps_root, project_root): + self.gen_class_name = gen_class_name + self.template_param = template_param + self.b2b_num = b2b_num + self.file_dir = output_dir + "/threadblock/" + + self.cutlass_deps_root = cutlass_deps_root + self.project_root = project_root + + + self.gen_b2b_mma_base = gen_b2b_mma_base(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) + self.gen_b2b_mma_pipelined = gen_b2b_mme_pipelined(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) + self.gen_default_b2b_mma = gen_default_b2b_mma(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) + + + def gen_code(self, first_use_1stage): + + base_code = self.gen_b2b_mma_base.gen_code() + print("[INFO]: Gen kernel code [b2b_mma_base.h]output Dir: is ", self.file_dir) + + with open(self.file_dir + "b2b_mma_base.h", "w+") as f: + f.write(base_code) + pipeline_code = self.gen_b2b_mma_pipelined.gen_code(first_use_1stage = first_use_1stage) + print("[INFO]: Gen kernel code [b2b_mma_pipelined.h]output Dir: is ", self.file_dir) + + with open(self.file_dir + "b2b_mma_pipelined.h", "w+") as f: + f.write(pipeline_code) + default_code = self.gen_default_b2b_mma.gen_code() + print("[INFO]: Gen kernel code [default_b2b_mma.h]output Dir: is ", self.file_dir) + + with open(self.file_dir + "default_b2b_mma.h", "w+") as f: + f.write(default_code) diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_turing_and_volta.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_turing_and_volta.py new file mode 100644 index 0000000000..db1ec4c72f --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_turing_and_volta.py @@ -0,0 +1,456 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################################# + +import helper +import gen_ir as ir + +class gen_turing_impl: + def __init__(self,fuse_gemm_info, gen_class_name, user_header_file, output_dir = "../"): + self.fuse_gemm_info = fuse_gemm_info + self.class_name = gen_class_name + self.gen_class_name = gen_class_name + "_turing_impl" + self.user_header_file = "" + for header in user_header_file: + self.user_header_file += "#include \"" + header + "\"\n" + self.output_dir = output_dir + self.b2b_num = len(fuse_gemm_info) + + self.gen_turing_unfused = gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir) + + def gen_using(self): + code_using = "using b2b_gemm = typename cutlass::gemm::device::" + self.class_name + ";" + + return code_using + "\n" + + def gen_initialize(self): + code = "" + for i in range(self.b2b_num): + code_this = "" + + code_this += helper.var_idx(helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + " alpha", i) + " = " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + "(1);\n" + beta = "(1)" + + if helper.get_epilogue_add_bias_or_not(self.fuse_gemm_info[i]) is False: + beta = "(0)" + code_this += helper.var_idx(helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + " beta", i) + " = " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + beta + ";\n" + k_str = str(self.fuse_gemm_info[i]['mnk'][2]) + if i == 0: + k_str = "K0" + code_this += helper.var_idx("cutlass::gemm::GemmCoord problem_size_", i) + "(M, " + str(self.fuse_gemm_info[i]['mnk'][1]) + ", " + k_str + ");\n" + code += code_this + code += "typename b2b_gemm::Arguments arguments{\n" + + for i in range(self.b2b_num): + code += " " + helper.var_idx("problem_size_", i) + ",\n" + + + code += " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + "*>(" + helper.var_idx("A", 0) + "), " + helper.var_idx("problem_size_", 0) + ".k()},\n" + + for i in range(self.b2b_num): + + ldmB = str(self.fuse_gemm_info[i]['mnk'][2]) + if i == 0: + ldmB = "K0" + + if self.fuse_gemm_info[i]['B_format'] is 'Row': + ldmB = str(self.fuse_gemm_info[i]['mnk'][1]) + + ldmC = str(helper.get_epilogue_bias_ldm(self.fuse_gemm_info[i])) + + code += " " + "{reinterpret_cast<" + 
helper.type_2_cutlass_type(self.fuse_gemm_info[i]['B_tp']) + "*>(" + helper.var_idx("B", i) + "), " + ldmB + "},\n" + code += " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("C", i) + "), " + ldmC + "},\n" + code += " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("D", self.b2b_num -1) + "), " + helper.var_idx("problem_size_", self.b2b_num - 1) + ".n()},\n" + + + for i in range(self.b2b_num): + code += " " + "{ " + helper.var_idx("alpha", i) + ", " + helper.var_idx("beta", i) + for epilogue_arg in helper.get_epilogue_args(self.fuse_gemm_info[i]): + arg_name = helper.var_idx("Epilogue", i) + "_" + epilogue_arg[1] + code += ", " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + "(" + str(arg_name) + ")" + code += "},\n" + code += " " + "Batch};\n\n" + + code += " " "b2b_gemm gemm_op;\n" + code += " " + "gemm_op.initialize(arguments);\n" + return code + "\n" + + + + def gen_run(self): + code = " " + "gemm_op(stream);\n" + + return code + + def gen_wrapper(self): + code_body = "" + + arg_lists = [] + arg_lists.append(["int", "M"]) + arg_lists.append(["int", "K0"]) + arg_lists.append(["int", "Batch"]) + arg_lists.append(["void*", helper.var_idx("A", 0)]) + for i in range(self.b2b_num): + arg_lists.append(["void*", helper.var_idx("B", i)]) + arg_lists.append(["void*", helper.var_idx("C", i)]) + arg_lists.append(["void*", helper.var_idx("D", i)]) + epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i]) + acc_tp = helper.get_epilogue_compute_tp(self.fuse_gemm_info[i]) + for arg in epilogue_args: + arg_tp = arg[0] + arg_name = helper.var_idx("Epilogue", i) + "_" + arg[1] + arg_lists.append([arg_tp, arg_name]) + + if self.b2b_num == 1: + code_body += self.gen_turing_unfused.gen_using(False) #False -> Turing, True -> Volta + code_body += self.gen_turing_unfused.gen_initialize() + code_body += self.gen_turing_unfused.gen_run() + else: + code_body += self.gen_using() + code_body += self.gen_initialize() + code_body += self.gen_run() + + code = ir.gen_func(self.gen_class_name, arg_lists, code_body) + + return code + + def gen_code(self): + + code = self.gen_wrapper() + helper.write_2_headfile("turing_impl.h", self.output_dir, self.user_header_file + "\n" + code) + +class gen_volta_turing_fuse_act_impl: + def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, output_dir = "../"): + self.fuse_gemm_info = fuse_gemm_info + self.gen_class_name = gen_class_name + "_volta_impl" + self.user_header_file = "" + for header in user_header_file: + self.user_header_file += "#include \"" + header + "\"\n" + self.output_dir = output_dir + self.b2b_num = len(fuse_gemm_info) + + def perf_tiling(self, layer_mnk): + mnk = layer_mnk[:] + block_tile = mnk[:] + block_tile[2] = 32 # force the K tile to be 32 + + # M tile gen + block_tile[0] = 32 + + # N tile gen + if mnk[1] > 128: + block_tile[1] = 256 + elif mnk[1] > 64: + block_tile[1] = 128 + elif mnk[1] > 32: + block_tile[1] = 64 + else : + block_tile[1] = 32 + + warp_tile = block_tile[:] + if block_tile[1] == 256: + warp_tile[1] = 64 + elif block_tile[1] == 128: + warp_tile[1] = 32 + elif block_tile[1] == 64: + warp_tile[1] = 32 + else : + warp_tile[1] = 32 + + warp_tile[0] = 32 + + return block_tile, warp_tile + + + def process_epilogue(self, epilogue_tp, n, C_tp, Acc_tp): + epilogue_setted_type = epilogue_tp + cutlass_epilogue_name = "LinearCombinationRelu" + if epilogue_setted_type.lower() == 
'leakyrelu': + cutlass_epilogue_name = "LinearCombinationLeakyRelu" + elif epilogue_setted_type.lower() == 'identity': + cutlass_epilogue_name = "LinearCombination" + + + n_mod_8 = n % 4 + N_align_elements = 1 + if n_mod_8 == 0: + N_align_elements = 8 + elif n_mod_8 == 4: + N_align_elements = 4 + elif n_mod_8 == 2 or n_mod_8 == 6: + N_align_elements = 2 + + epilogue_str = "cutlass::epilogue::thread::" + cutlass_epilogue_name+ "<" + C_tp + ", " + str(N_align_elements) + ", " + Acc_tp + ", " + Acc_tp + ">" + + return epilogue_str + + def gen_using(self, volta = True): + code_using = "" + volta_arch = "cutlass::arch::Sm70" + volta_tc = "cutlass::gemm::GemmShape<8, 8, 4>" + + turing_arch = "cutlass::arch::Sm75" + turing_tc = "cutlass::gemm::GemmShape<16, 8, 8>" + + arch = "" + tc = "" + if volta: + arch = volta_arch + tc = volta_tc + else: + arch = turing_arch + tc = turing_tc + + for i in range(self.b2b_num): + + k = self.fuse_gemm_info[i]['mnk'][2] + + k_mod_8 = k % 4 + ab_ldm = 1 + if k_mod_8 == 0: + ab_ldm = 8 + elif k_mod_8 == 4: + ab_ldm = 4 + elif k_mod_8 == 2 or k_mod_8 == 6: + ab_ldm = 2 + + block_tile, warp_tile = self.perf_tiling(self.fuse_gemm_info[i]['mnk']) + + this_gemm_config = helper.var_idx("using Gemm", i) + " = cutlass::gemm::device::GemmBatched<\n" + this_gemm_config += " " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + ",\n" + this_gemm_config += " " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_format']) + ",\n" + this_gemm_config += " " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['B_tp']) + ",\n" + this_gemm_config += " " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['B_format']) + ",\n" + this_gemm_config += " " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + ",\n" + this_gemm_config += " " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_format']) + ",\n" + this_gemm_config += " " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + ",\n" + this_gemm_config += " " + "cutlass::arch::OpClassTensorOp,\n" + this_gemm_config += " " + arch + ",\n" + this_gemm_config += " " + "cutlass::gemm::GemmShape<" + str(block_tile[0]) + ", " + str(block_tile[1]) + ", " + str(block_tile[2]) + ">,\n" + this_gemm_config += " " + "cutlass::gemm::GemmShape<" + str(warp_tile[0]) + ", " + str(warp_tile[1]) + ", " + str(warp_tile[2]) + ">,\n" + this_gemm_config += " " + tc + ",\n" + this_gemm_config += " " + self.process_epilogue(helper.get_epilogue_tp(self.fuse_gemm_info[i]), self.fuse_gemm_info[i]['mnk'][1], helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']), helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp'])) + ",\n" + this_gemm_config += " " + "cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,\n" + this_gemm_config += " " + "2,\n" + this_gemm_config += " " + str(ab_ldm) + ",\n" + this_gemm_config += " " + str(ab_ldm) + ">;\n" + + code_using += this_gemm_config + "\n" + + return code_using + "\n" + + def gen_initialize(self): + code = "" + for i in range(self.b2b_num): + code_this = "" + + N_str = str(self.fuse_gemm_info[i]['mnk'][1]) + + code_this += helper.var_idx(helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + " alpha", i) + " = " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + "(1);\n" + beta = "(1)" + if helper.get_epilogue_add_bias_or_not( self.fuse_gemm_info[i]) is False: + beta = "(0)" + code_this += helper.var_idx(helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + " beta", i) + " = " + 
helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + beta + ";\n" + + k_str = str(self.fuse_gemm_info[i]['mnk'][2]) + if i == 0: + k_str = "K0" + code_this += helper.var_idx("cutlass::gemm::GemmCoord problem_size_", i) + "(M, " + str(self.fuse_gemm_info[i]['mnk'][1]) + ", " + k_str + ");\n" + code_this += helper.var_idx("typename Gemm", i) + helper.var_idx("::Arguments arguments_", i) + "{\n" + code_this += " " + helper.var_idx("problem_size_", i) + ",\n" + ldmA = k_str + ldmB = k_str + ldmC = str(self.fuse_gemm_info[i]['mnk'][1]) + + ldmBias = str(helper.get_epilogue_bias_ldm(self.fuse_gemm_info[i])) + + if self.fuse_gemm_info[i]['A_format'] is 'Col': + ldmA = "M" + if self.fuse_gemm_info[i]['B_format'] is 'Row': + ldmB = str(self.fuse_gemm_info[i]['mnk'][1]) + if self.fuse_gemm_info[i]['C_format'] is 'Col': + ldmC = "M" + + if i == 0: + code_this += " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + "*>(" + helper.var_idx("A", i) + "), " + ldmA + "}, " + "M * " + ldmA + ",\n" + else: + code_this += " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + "*>(" + helper.var_idx("D", i - 1) + "), " + ldmA + "}, " + "M * " + ldmA + ",\n" + + code_this += " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['B_tp']) + "*>(" + helper.var_idx("B", i) + "), " + ldmB + "}, " + N_str + " * " + ldmB + ",\n" + + M_bias = str(helper.get_epilogue_bias_shape(self.fuse_gemm_info[i])[0]) + + code_this += " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("C", i) + "), " + ldmBias + "}, " + M_bias + " * " + N_str + ",\n" + code_this += " " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("D", i) + "), " + ldmC + "}, " + "M * " + ldmC + ",\n" + code_this += " " + "{ " + helper.var_idx("alpha", i) + ", " + helper.var_idx("beta", i) + for epilogue_arg in helper.get_epilogue_args(self.fuse_gemm_info[i]): + arg_name = helper.var_idx("Epilogue", i) + "_" + epilogue_arg[1] + code_this += ", " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + "(" + str(arg_name) + ")" + code_this += " },\n" + code_this += " " + "Batch};\n" + + code_this += " " + helper.var_idx("Gemm", i) + helper.var_idx(" gemm_op_", i) + ";\n" + code_this += " " + helper.var_idx("gemm_op_", i) + helper.var_idx(".initialize(arguments_", i) + ", nullptr);\n" + + code += code_this + "\n" + return code + "\n" + + + def gen_run(self): + code = "" + for i in range(self.b2b_num): + code_this = "" + code_this += " " + helper.var_idx("gemm_op_", i) + "(stream);\n" + + code += code_this + return code + + def gen_wrapper(self): + code_body = "" + + arg_lists = [] + arg_lists.append(["int", "M"]) + arg_lists.append(["int", "K0"]) + arg_lists.append(["int", "Batch"]) + arg_lists.append(["void*", helper.var_idx("A", 0)]) + for i in range(self.b2b_num): + arg_lists.append(["void*", helper.var_idx("B", i)]) + arg_lists.append(["void*", helper.var_idx("C", i)]) + arg_lists.append(["void*", helper.var_idx("D", i)]) + epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i]) + acc_tp = helper.get_epilogue_compute_tp(self.fuse_gemm_info[i]) + for arg in epilogue_args: + arg_tp = arg[0] + arg_name = helper.var_idx("Epilogue", i) + "_" + arg[1] + arg_lists.append([arg_tp, arg_name]) + code_body += self.gen_using() + code_body += self.gen_initialize() + code_body += self.gen_run() + + code = 
ir.gen_func(self.gen_class_name, arg_lists, code_body) + + return code + + def gen_code(self): + code = self.gen_wrapper() + helper.write_2_headfile("volta_impl.h", self.output_dir, self.user_header_file + "\n" + code) + +class gen_one_API: + def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, output_dir = "../"): + self.fuse_gemm_info = fuse_gemm_info + self.gen_class_name = gen_class_name + self.user_header_file = "" + for header in user_header_file: + self.user_header_file += "#include \"" + header + "\"\n" + self.output_dir = output_dir + self.b2b_num = len(fuse_gemm_info) + + self.gen_volta = gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir) + + self.gen_turing = gen_turing_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir) + + def gen_CUTLASS_irrelevant_API(self): + code = "" + code += "#include \n" + code += "#include \n" + + param_name = "Fused" + str(self.b2b_num) + "xGemm_" + for i in range(self.b2b_num): + param_name += str(self.fuse_gemm_info[i]['mnk'][1]) + "_" + param_name += "Params" + params = "" + params += " " + "int M;\n" + params += " " + "int K0;\n" + params += " " + "int Batch;\n" + params += " " + "const void* A0;\n" + for i in range(self.b2b_num): + params += " " + "const void* " + helper.var_idx("B", i) + ";\n" + params += " " + "const void* " + helper.var_idx("C", i) + ";\n" + epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i]) + acc_tp = helper.get_epilogue_compute_tp(self.fuse_gemm_info[i]) + for arg in epilogue_args: + arg_tp = arg[0] + arg_name = helper.var_idx("Epilogue", i) + "_" + arg[1] + params += " " + arg_tp + " " + arg_name + ";\n" + params += " " + "void* " + helper.var_idx("D", i) + ";\n" + code += ir.gen_struct(param_name, params) + code += "using Param = " + param_name + ";\n" + code += "void one_api( const Param & param, int sm, cudaStream_t stream);\n" + + + return code + + def gen_one_api(self): + code = "" + code += "/* Auto Generated code - Do not edit.*/\n" + code += "#include \"cutlass_irrelevant.h\"\n" + code += "#include \"api.h\"\n" + code += "void one_api( const Param & param, int sm, cudaStream_t stream) {\n" + + code += " " + "if (sm == 70) \n" + code += " " + " " + self.gen_class_name + "_volta_impl(param.M, param.K0, param.Batch, const_cast(param.A0), " + for i in range(self.b2b_num): + code += helper.var_idx("const_cast(param.B", i) + "), " + code += helper.var_idx("const_cast(param.C", i) + "), " + code += helper.var_idx("param.D", i) + ", " + epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i]) + for arg in epilogue_args: + arg_name = helper.var_idx("Epilogue", i) + "_" + arg[1] + code += "param." + arg_name + ", " + code += "stream);\n" + code += " " + "else if(sm >= 75) \n" + code += " " + " " + self.gen_class_name + "_turing_impl(param.M, param.K0, param.Batch, const_cast(param.A0), " + for i in range(self.b2b_num): + code += helper.var_idx("const_cast(param.B", i) + "), " + code += helper.var_idx("const_cast(param.C", i) + "), " + code += helper.var_idx("param.D", i) + ", " + epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i]) + for arg in epilogue_args: + arg_name = helper.var_idx("Epilogue", i) + "_" + arg[1] + code += "param." 
+ arg_name + ", " + code += "stream);\n" + code += " " + "else assert(0);\n" + code += "}\n" + return code + + def gen_code(self): + + turing_code = self.gen_turing.gen_wrapper() + volta_code = self.gen_volta.gen_wrapper() + cutlass_irrelevant_code = self.gen_CUTLASS_irrelevant_API() + + one_api_code = self.gen_one_api() + with open(self.output_dir + "one_api.cu", "w+") as f: + f.write(one_api_code) + + helper.write_2_headfile("cutlass_irrelevant.h", self.output_dir, cutlass_irrelevant_code) + + helper.write_2_headfile("api.h", self.output_dir, self.user_header_file + "\n" + turing_code + volta_code) diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py new file mode 100644 index 0000000000..44f3876588 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py @@ -0,0 +1,92 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################################# + +import helper +import gen_ir as ir + +import gen_turing_and_volta as gen_basic + + +class gen_verify: + def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, output_dir = "../"): + self.fuse_gemm_info = fuse_gemm_info + self.name = gen_class_name + "_verify" + self.b2b_num = len(fuse_gemm_info) + self.params = [] + self.user_header_file = "" + for header in user_header_file: + self.user_header_file += "#include \"" + header + "\"\n" + self.separate_cutlass = gen_basic.gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir) + self.gen_params() + self.output_dir = output_dir + + + def gen_code(self): + code = "" + code += self.user_header_file + code += self.separate_cutlass.gen_using(False) #False -> Turing, True -> Volta + + code_body = "" + for i in range(self.b2b_num): + code_body += " " + helper.var_idx("Gemm", i) + helper.var_idx(" gemm_op_", i) + ";\n" + code_body += " " + helper.var_idx("gemm_op_", i) + helper.var_idx(".initialize(Arguments_", i) + ", nullptr);\n" + + code_body += self.separate_cutlass.gen_run() + + code += ir.gen_func(self.name, self.params, code_body) + helper.write_2_headfile("cutlass_verify.h", self.output_dir, code) + + + def gen_params(self): + for i in range(self.b2b_num): + self.params.append( + ( + helper.var_idx("typename Gemm", i)+ "::Arguments", + helper.var_idx("Arguments_", i) + ) + ) + + + def get_params(self, declartion = True): + code = "" + if declartion: + for param in self.params: + code += param[0] + " " + param[1] + ";\n" + + return code + + + def gen_initialize(): + code = "" + initialize_code = self.separate_cutlass.gen_initialize() + + code = ir.gen_func("initialize", [[]]) diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/generate.sh b/examples/44_multi_gemm_ir_and_codegen/ir_gen/generate.sh new file mode 100755 index 0000000000..19d19ea937 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/generate.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################################# + +NUM_ARGS=3 +if [ $# -ne $NUM_ARGS ]; then + echo "Usage: $0 " + echo " config_file: JSON file containing configuration to run" + echo " output_directory: directory to store results" + echo " cutlass_directory: directory containing cutlass source" + exit 1 +fi + +config_file=$1 +output_dir=$2 +cutlass_dir=$3 + +python3 gen_all_code.py \ + --config-file $config_file \ + --gen-name FusedMultiGemmForward \ + --output-dir $output_dir \ + --cutlass-dir $cutlass_dir diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py new file mode 100644 index 0000000000..d9891404fa --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py @@ -0,0 +1,135 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
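
As a quick illustration of the helper utilities defined in this file, the snippet below exercises a few of them on a made-up layer dictionary (the values are illustrative assumptions, not taken from the example's real configuration):

    # Illustrative checks for helper.py; run from the ir_gen/ directory.
    import helper

    assert helper.type_2_cutlass_type("fp16") == "cutlass::half_t"
    assert helper.type_2_cutlass_type("Row") == "cutlass::layout::RowMajor"
    assert helper.var_idx("IteratorB", 1) == "IteratorB1"

    # A vector bias on a row-major C: leading dimension 0, shape broadcast over M.
    layer = {"mnk": [0, 64, 128], "C_format": "Row", "Acc_tp": "fp32",
             "epilogue": {"tp": "relu",
                          "bias": {"addbias": True, "bias_tp": "vec"},
                          "args": []}}
    assert helper.get_epilogue_bias_ldm(layer) == 0
    assert helper.get_epilogue_bias_shape(layer) == [1, 64]
    assert helper.get_epilogue_compute_tp(layer) == "fp32"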
+# +################################################################################################# + +def type_2_cutlass_type(input_type = "fp16"): + # float point type + if input_type == "fp32": + return "float" + if input_type == "bf16": + return "cutlass::bfloat16_t" + if input_type == "fp16": + return "cutlass::half_t" + + # integer type + if(input_type == "int32"): + return "int32_t" + if(input_type == "int8"): + return "int8_t" + + if input_type == 'Row': + return 'cutlass::layout::RowMajor' + if input_type == 'Col': + return 'cutlass::layout::ColumnMajor' + +def cvt_2_cutlass_shape(gemm_shape): + # gemm shape + if len(gemm_shape) == 3: + val = "cutlass::gemm::GemmShape<" \ + + str(gemm_shape[0]) + ", " \ + + str(gemm_shape[1]) + ", " \ + + str(gemm_shape[2]) + ">" + return val + + +def write_2_headfile(filename, file_dir, string): + with open(file_dir + filename, 'w') as f: + f.write("/* Auto Generated code - Do not edit.*/\n\n\n#pragma once\n" + string) + +def var_idx(varaiable, index): + return varaiable + str(index) + + +def list_2_string(input_list, ): + rtn_string = "" + + cnt = 0 + + for element in input_list: + final = ", \n" + if cnt == len(input_list) - 1: + final = "\n" + cnt += 1 + rtn_string += str(element) + final + + return rtn_string + + +def get_epilogue_info(layer_info): + return layer_info['epilogue'] + +def get_epilogue_tp(layer_info): + epilogue_info = get_epilogue_info(layer_info) + return epilogue_info['tp'] + +def get_epilogue_add_bias_or_not(layer_info): + epilogue_info = get_epilogue_info(layer_info) + return epilogue_info['bias']['addbias'] + +def get_epilogue_add_bias_tp(layer_info): + epilogue_info = get_epilogue_info(layer_info) + return epilogue_info['bias']['bias_tp'] + +def get_epilogue_args(layer_info): + epilogue_info = get_epilogue_info(layer_info) + return epilogue_info['args'] + +def get_epilogue_bias_shape(layer_info): + bias_tp = get_epilogue_add_bias_tp(layer_info).lower() + mn_shape = layer_info['mnk'][:-1] + + if bias_tp == 'mat': + mn_shape[0] = 'M' + return mn_shape + elif bias_tp == 'vec': + mn_shape[0] = 1 + return mn_shape + else: + assert(0) + +def get_epilogue_bias_ldm(layer_info): + bias_tp = get_epilogue_add_bias_tp(layer_info).lower() + mn_shape = layer_info['mnk'][:-1] + + c_layout = layer_info['C_format'].lower() + + if c_layout != 'row': + assert(0) + + if bias_tp == 'mat': + return mn_shape[1] + elif bias_tp == 'vec': + return 0 + else: + assert(0) + +def get_epilogue_compute_tp(layer_info): + return layer_info['Acc_tp'] diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/replace_fix_impl_header.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/replace_fix_impl_header.py new file mode 100644 index 0000000000..bbcd050f02 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/replace_fix_impl_header.py @@ -0,0 +1,67 @@ +################################################################################################# +# +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################################# + +import os + +class replace_fix_impl: + def __init__(self, src_dir, dst_dir, cutlass_deps_root): + self.src_dir = src_dir + self.dst_dir = dst_dir + self.cutlass_deps_root = cutlass_deps_root + + + + def gen_code(self): + for sub_dir in os.walk(self.src_dir): + files_in_sub_dir = sub_dir[2] + + src_dirs = sub_dir[0] + output_dirs = self.dst_dir + sub_dir[0][len(self.src_dir):] + + if not os.path.exists(output_dirs): + os.mkdir(output_dirs) + + for f in files_in_sub_dir: + with open(src_dirs +"/" + f, 'r') as current_file: + output_lines = [] + lines = current_file.readlines() + + for line in lines: + if(len(line) >= len("#include \"cutlass") and line[:len("#include \"cutlass")] == "#include \"cutlass"): + new_line = "#include \"" + self.cutlass_deps_root + line[len("#include \""):] + # print(new_line) + output_lines.append(new_line) + else: + output_lines.append(line) + + with open(output_dirs + "/" + f, "w+") as dest_file: + dest_file.writelines(output_lines) diff --git a/examples/44_multi_gemm_ir_and_codegen/leaky_bias.h b/examples/44_multi_gemm_ir_and_codegen/leaky_bias.h new file mode 100644 index 0000000000..10b49049b1 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/leaky_bias.h @@ -0,0 +1,292 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
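The `replace_fix_impl` helper above rewrites only those generated lines that begin with `#include "cutlass`, splicing the configured dependency root in directly after the opening quote. A minimal C++ sketch of the same string transformation, with a hypothetical root path standing in for the generator's `--cutlass-dir` argument:

```
#include <iostream>
#include <string>

// Mirrors replace_fix_impl.gen_code: any generated line starting with
// `#include "cutlass` gets cutlass_deps_root spliced in after the opening quote.
std::string rewrite_include(std::string const &line, std::string const &cutlass_deps_root) {
  std::string const prefix = "#include \"cutlass";
  if (line.compare(0, prefix.size(), prefix) == 0) {
    return "#include \"" + cutlass_deps_root + line.substr(std::string("#include \"").size());
  }
  return line;
}

int main() {
  // Prints: #include "../../include/cutlass/gemm/device/gemm.h"
  std::cout << rewrite_include("#include \"cutlass/gemm/device/gemm.h\"", "../../include/") << "\n";
  return 0;
}
```

Everything else in each generated file is copied through unchanged.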
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once +#include + +template +__device__ +T add(T const & a, T const &b){ + return (a + b); +} + +template <> +__device__ +half2 add(half2 const & a, half2 const &b){ + return (__hadd2(a,b)); +} + +template +struct RELU{ + __device__ + T operator()(T const & a){ + return a > T(0) ? a : T(0); + } + __device__ + half2 operator()(half2 const & a){ + float2 a_fp32x2 = __half22float2(a); + a_fp32x2.x = a_fp32x2.x > 0.f ? a_fp32x2.x : 0.f; + a_fp32x2.y = a_fp32x2.y > 0.f ? a_fp32x2.y : 0.f; + if(a_fp32x2.x < 0.f || a_fp32x2.y < 0.f) + printf(" %f %f\n", a_fp32x2.x ,a_fp32x2.y); + return __float22half2_rn(a_fp32x2); + } +}; + +template +struct LEAKY_RELU{ + __device__ + T operator()(T const & a, T const & scale = half(1)){ + return a > T(0) ? a : scale * a; + } + __device__ + half2 operator()(half2 const & a, half const & scale = half(1)){ + half2 zero = __half2half2(half(0)); + half2 gt_zero = __hge2(a, zero); + half2 le_zero = __hle2(a, zero); + + + half2 scale_f16x2 = __half2half2(scale); + half2 mask_scale_f16x2 = __hfma2(le_zero, scale_f16x2, gt_zero); + return __hmul2(a, mask_scale_f16x2); + } +}; + +template +__global__ void leaky_and_activation(half* inout, half* bias, half scale, bool mat_bias){ + + constexpr bool N_MOD_2 = N & 1 ? false : true; + + using Access_tp = typename std::conditional::type; + + constexpr int Access_elements = sizeof(Access_tp) / sizeof(half); + + constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1 ) / (BLOCKDIM * Access_elements); + + LEAKY_RELU Act; + Access_tp src_v[iter]; + Access_tp bias_v[iter]; + + int batch_id = blockIdx.y; + int batch_offset = batch_id * gridDim.x * N; + + for(int i = 0; i < iter; i++){ + int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements; + if (idx < N){ + src_v[i] = *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset); + if (mat_bias) + bias_v[i] = *reinterpret_cast(bias + blockIdx.x * N + idx + batch_offset); + else + bias_v[i] = *reinterpret_cast(bias + idx + batch_id * N); + *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset) = Act(add(src_v[i],bias_v[i]),scale); + } + + } +} + + + +template +__global__ void leaky_and_activation(half* inout, half scale){ + + constexpr bool N_MOD_2 = N & 1 ? 
false : true; + + using Access_tp = typename std::conditional::type; + + constexpr int Access_elements = sizeof(Access_tp) / sizeof(half); + + constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1 ) / (BLOCKDIM * Access_elements); + + int batch_id = blockIdx.y; + int batch_offset = batch_id * gridDim.x * N; + + LEAKY_RELU Act; + Access_tp src_v[iter]; + + for(int i = 0; i < iter; i++){ + int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements; + if (idx < N){ + src_v[i] = *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset); + *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset) = Act(src_v[i], scale); + } + + } +} + + + +template +void leaky_and_activation(half* inout, half* bias, int m, int b, half scale, bool mat_bias){ + + dim3 grid(m, b); + if (bias == nullptr) + leaky_and_activation<<>>(inout, scale); + else + leaky_and_activation<<>>(inout, bias, scale, mat_bias); +} + +template +__global__ void relu_and_activation(half* inout, half* bias, bool mat_bias){ + + constexpr bool N_MOD_2 = N & 1 ? false : true; + + using Access_tp = typename std::conditional::type; + + constexpr int Access_elements = sizeof(Access_tp) / sizeof(half); + + constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1 ) / (BLOCKDIM * Access_elements); + + RELU Act; + Access_tp src_v[iter]; + Access_tp bias_v[iter]; + + int batch_id = blockIdx.y; + int batch_offset = batch_id * gridDim.x * N; + + for(int i = 0; i < iter; i++){ + int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements; + if (idx < N){ + src_v[i] = *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset); + if (mat_bias) + bias_v[i] = *reinterpret_cast(bias + blockIdx.x * N + idx + batch_offset); + else + bias_v[i] = *reinterpret_cast(bias + idx + batch_id * N); + *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset) = Act(add(src_v[i],bias_v[i])); + } + + } +} + + + +template +__global__ void relu_and_activation(half* inout){ + + constexpr bool N_MOD_2 = N & 1 ? false : true; + + using Access_tp = typename std::conditional::type; + + constexpr int Access_elements = sizeof(Access_tp) / sizeof(half); + + constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1 ) / (BLOCKDIM * Access_elements); + + int batch_id = blockIdx.y; + int batch_offset = batch_id * gridDim.x * N; + + RELU Act; + Access_tp src_v[iter]; + + for(int i = 0; i < iter; i++){ + int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements; + if (idx < N){ + src_v[i] = *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset); + *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset) = Act(src_v[i]); + } + + } +} + + + +template +void relu_and_activation(half* inout, half* bias, int m, int b, bool mat_bias){ + dim3 grid(m, b); + if (bias == nullptr) + relu_and_activation<<>>(inout); + else + relu_and_activation<<>>(inout, bias, mat_bias); +} + + +template +__global__ void identity_and_activation(half* inout, half* bias, bool mat_bias){ + + constexpr bool N_MOD_2 = N & 1 ? 
false : true; + + using Access_tp = typename std::conditional::type; + + constexpr int Access_elements = sizeof(Access_tp) / sizeof(half); + + constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1 ) / (BLOCKDIM * Access_elements); + + int batch_id = blockIdx.y; + int batch_offset = batch_id * gridDim.x * N; + + Access_tp src_v[iter]; + Access_tp bias_v[iter]; + + for(int i = 0; i < iter; i++){ + int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements; + if (idx < N){ + src_v[i] = *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset); + if (mat_bias) + bias_v[i] = *reinterpret_cast(bias + blockIdx.x * N + idx + batch_offset); + else + bias_v[i] = *reinterpret_cast(bias + idx + batch_id * N); + *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset) = (add(src_v[i],bias_v[i])); + } + + } +} + +template +__global__ void identity_and_activation(half* inout){ + + constexpr bool N_MOD_2 = N & 1 ? false : true; + + using Access_tp = typename std::conditional::type; + + constexpr int Access_elements = sizeof(Access_tp) / sizeof(half); + + constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1 ) / (BLOCKDIM * Access_elements); + + int batch_id = blockIdx.y; + int batch_offset = batch_id * gridDim.x * N; + Access_tp src_v[iter]; + + for(int i = 0; i < iter; i++){ + int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements; + if (idx < N){ + src_v[i] = *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset); + *reinterpret_cast(inout + blockIdx.x * N + idx + batch_offset) = (src_v[i]); + } + + } +} + +template +void identity_and_activation(half* inout, half* bias, int m, int b, bool mat_bias){ + dim3 grid(m, b); + if (bias == nullptr) + identity_and_activation<<>>(inout); + else + identity_and_activation<<>>(inout, bias, mat_bias); +} diff --git a/examples/44_multi_gemm_ir_and_codegen/utils.h b/examples/44_multi_gemm_ir_and_codegen/utils.h new file mode 100644 index 0000000000..2b05ae9367 --- /dev/null +++ b/examples/44_multi_gemm_ir_and_codegen/utils.h @@ -0,0 +1,94 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
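The half2 path of the `LEAKY_RELU` functor in `leaky_bias.h` above exploits the fact that `__hge2`/`__hle2` return 1.0 or 0.0 per lane, so the negative-side scale can be blended in with a single `__hfma2` and one multiply. A small standalone check of that masking idea (a hypothetical test, not part of the generated example):

```
#include <cstdio>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Per-lane masking trick used by LEAKY_RELU above:
// mask = (x <= 0) * scale + (x > 0), then y = x * mask.
__global__ void leaky_relu_half2(half2 *x, half scale) {
  half2 v = *x;
  half2 zero = __half2half2(__float2half(0.f));
  half2 gt_zero = __hge2(v, zero);   // 1.0 where v >= 0, else 0.0
  half2 le_zero = __hle2(v, zero);   // 1.0 where v <= 0, else 0.0
  half2 mask = __hfma2(le_zero, __half2half2(scale), gt_zero);
  *x = __hmul2(v, mask);
}

int main() {
  half2 h = __floats2half2_rn(2.0f, -4.0f);
  half2 *d;
  cudaMalloc(&d, sizeof(half2));
  cudaMemcpy(d, &h, sizeof(half2), cudaMemcpyHostToDevice);
  leaky_relu_half2<<<1, 1>>>(d, __float2half(0.1f));
  cudaMemcpy(&h, d, sizeof(half2), cudaMemcpyDeviceToHost);
  // Expect approximately 2.0 and -0.4 for scale = 0.1.
  std::printf("%f %f\n", __half2float(__low2half(h)), __half2float(__high2half(h)));
  cudaFree(d);
  return 0;
}
```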
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once +#define TI(tag) \ + cudaEvent_t _event_start_ ##tag; \ + cudaEvent_t _event_end_ ##tag; \ + float _event_time_ ##tag; \ + cudaEventCreate(& _event_start_ ##tag); \ + cudaEventCreate(& _event_end_ ##tag); \ + cudaEventRecord(_event_start_ ##tag); + +#define TO(tag, str, times) \ + cudaEventRecord(_event_end_ ##tag); \ + cudaEventSynchronize(_event_end_ ##tag); \ + cudaEventElapsedTime(&_event_time_ ##tag, _event_start_ ##tag, _event_end_ ##tag); \ + float _event_time_once_ ##tag = _event_time_ ##tag / times; \ + printf("%20s:\t %10.3fus\t", str, _event_time_once_ ##tag * 1000); \ + cudaDeviceSynchronize(); \ + printf("%20s string: %s\n",str, cudaGetErrorString(cudaGetLastError())); + +template +struct memory_unit{ + T* host_ptr; + T* device_ptr; + int size_bytes; + int elements; + void h2d(){ + cudaMemcpy(device_ptr, host_ptr, size_bytes, cudaMemcpyHostToDevice); + } + void d2h(){ + cudaMemcpy(host_ptr, device_ptr, size_bytes, cudaMemcpyDeviceToHost); + } + void free_all(){ + free(host_ptr); + cudaFree(device_ptr); + } + memory_unit(int elements_): size_bytes(elements_ * sizeof(T)), elements(elements_){ + host_ptr = (T*) malloc(elements_ * sizeof(T)); + cudaMalloc((void**)&device_ptr, elements_ * sizeof(T)); + } + void init(int abs_range = 1){ + for(int i = 0; i < elements; i++){ + host_ptr[i] = T(rand() % 100 / float(100) * 2 * abs_range - abs_range); + } + h2d(); + } +}; + +template +int check_result(T * a, T * b, int N){ + int cnt = 0; + for(int i = 0; i < N; i ++){ + float std = float(a[i]); + float my = float(b[i]); + + if(abs(std - my) / abs(std) > 1e-2) + { + // printf("my: %f , std: %f\n", my, std); + cnt++; + } + + } + printf("total err: %d / %d\n", cnt, N); + return cnt; +} diff --git a/examples/45_dual_gemm/CMakeLists.txt b/examples/45_dual_gemm/CMakeLists.txt new file mode 100644 index 0000000000..de704ed2b1 --- /dev/null +++ b/examples/45_dual_gemm/CMakeLists.txt @@ -0,0 +1,36 @@ + +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
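The `TI`/`TO` macros and the `memory_unit<T>` helper in `utils.h` above wrap CUDA-event timing and paired host/device allocations. A minimal sketch of how they are typically combined; the kernel below is a stand-in, and `utils.h` is assumed to be on the include path:

```
#include <cstdio>
#include <cuda_runtime.h>
#include "utils.h"  // the helper shown above

// Trivial kernel used only to have something to time.
__global__ void scale_kernel(float *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { x[i] *= 2.f; }
}

int main() {
  int const runs = 100;
  memory_unit<float> buf(1 << 20);  // allocates matching host and device buffers
  buf.init();                       // random host data in [-1, 1], then copies to device

  TI(scale);                        // create CUDA events and record the start
  for (int i = 0; i < runs; i++) {
    scale_kernel<<<(buf.elements + 255) / 256, 256>>>(buf.device_ptr, buf.elements);
  }
  TO(scale, "scale_kernel", runs);  // record the end, print the average time per launch

  buf.d2h();                        // copy results back for host-side inspection
  buf.free_all();
  return 0;
}
```

`TO` reports the average time per launch in microseconds along with the most recent CUDA error string.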
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +cutlass_example_add_executable( + 45_dual_gemm + dual_gemm.cu + ) + diff --git a/examples/45_dual_gemm/device/dual_gemm.h b/examples/45_dual_gemm/device/dual_gemm.h new file mode 100644 index 0000000000..f48073597f --- /dev/null +++ b/examples/45_dual_gemm/device/dual_gemm.h @@ -0,0 +1,499 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Performs a dual gemm in one fused kernel: +``` +D0 = epilogue0(X @ B0, C0) +D1 = epilogue1(X @ B1, C1) +D2 = element_wise(D0, D1) +``` +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" +#include "cutlass/device_kernel.h" + +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h" + +#include "../kernel/dual_gemm.h" +#include "../dual_gemm_common.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B0 matrix operand + typename LayoutB0_, + /// Layout type for B1 matrix operand + typename LayoutB1_, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator_, + /// Operator class tag + typename OperatorClass_, + /// Tag indicating architecture to tune for + typename ArchTag_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_, + /// Epilogue output operator + typename EpilogueOutputOp0_, + typename EpilogueOutputOp1_, + typename EpilogueOutputOp2_, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>, + /// Number of stages used in the pipelined mainloop + int Stages = + DefaultGemmConfiguration::kStages, + bool StoreD0 = true, + bool StoreD1 = true, + /// If true, kernel supports split-K with serial reduction + bool SplitKSerial = false, + /// Access granularity of A matrix in units of elements + int AlignmentA = + DefaultGemmConfiguration::kAlignmentA, + /// Access granularity of B matrix in units of elements + int AlignmentB = + DefaultGemmConfiguration::kAlignmentB, + /// Operation performed by GEMM + typename Operator_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::Operator> +class DualGemm { + public: + + using ElementA = ElementA_; + using LayoutA = LayoutA_; + using TensorRefA = TensorRef; + using ElementB = ElementB_; + using LayoutB0 = LayoutB0_; + using LayoutB1 = LayoutB1_; + using TensorRefB0 = TensorRef; + using TensorRefB1 = TensorRef; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + using ElementAccumulator = ElementAccumulator_; + using OperatorClass = OperatorClass_; + using ArchTag = ArchTag_; + using ThreadblockShape = ThreadblockShape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using EpilogueOutputOp0 = EpilogueOutputOp0_; + using EpilogueOutputOp1 = EpilogueOutputOp1_; + using EpilogueOutputOp2 = EpilogueOutputOp2_; + using ThreadblockSwizzle = 
ThreadblockSwizzle_; + using Operator = Operator_; + static int const kStages = Stages; + static int const kAlignmentA = AlignmentA; + static int const kAlignmentB = AlignmentB; + static int const kAlignmentC = EpilogueOutputOp1::kCount; + static bool const kSplitKSerial = SplitKSerial; + static bool constexpr kStoreD0 = StoreD0; + static bool constexpr kStoreD1 = StoreD1; + static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformB = ComplexTransform::kNone; + + using LayoutScaleBias = layout::RowMajor; + /// Define the kernel + /// Define the threadblock-scoped matrix multiply-accumulate + static_assert(ArchTag::kMinComputeCapability >= 80, "Only multistage is implemented"); + static_assert(kStages >= 3, "Only multistage is implemented"); + using Mma0 = typename cutlass::gemm::threadblock::DefaultMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB0, kAlignmentB, + ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, ArchTag, + ThreadblockShape, WarpShape, + InstructionShape, Stages, Operator>::ThreadblockMma; + using Mma1 = typename cutlass::gemm::threadblock::DefaultMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB1, kAlignmentB, + ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, ArchTag, + ThreadblockShape, WarpShape, + InstructionShape, Stages, Operator>::ThreadblockMma; + using DualMma = threadblock::DualMmaMultistage< + typename Mma0::Shape, + typename Mma0::IteratorA, + typename Mma0::SmemIteratorA, + Mma0::kCacheOpA, + typename Mma0::IteratorB, + typename Mma0::SmemIteratorB, + Mma0::kCacheOpB, + typename Mma1::IteratorB, + typename Mma1::SmemIteratorB, + typename Mma0::ElementC, + typename Mma0::LayoutC, + typename Mma0::Policy, + typename Mma1::Policy, + Mma0::kStages, + SharedMemoryClearOption::kNone + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + /// Define the epilogue + using Epilogue0 = + typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, typename DualMma::Operator0, kPartitionsK, EpilogueOutputOp0, + EpilogueOutputOp0::kCount>::Epilogue; + using Epilogue1 = + typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, typename DualMma::Operator1, kPartitionsK, EpilogueOutputOp1, + EpilogueOutputOp1::kCount>::Epilogue; + + /// Define the kernel-level GEMM operator. 
+ using DualGemmKernel = kernel::DualGemm< + DualMma, + Epilogue0, Epilogue1, EpilogueOutputOp2, + ThreadblockSwizzle, kSplitKSerial, + kStoreD0, kStoreD1>; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + DualGemmMode mode; + GemmCoord problem_size; + TensorRef ref_A0; + TensorRef ref_B0; + TensorRef ref_C0; + TensorRef ref_D0; + TensorRef ref_B1; + TensorRef ref_C1; + TensorRef ref_D1; + TensorRef ref_D2; + typename EpilogueOutputOp0::Params epilogue0; + typename EpilogueOutputOp1::Params epilogue1; + typename EpilogueOutputOp2::Params epilogue2; + int split_k_slices; + + int batch_count; + int64_t batch_stride_A; + int64_t batch_stride_B0; + int64_t batch_stride_B1; + int64_t batch_stride_C; + int64_t batch_stride_D; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments(): problem_size(0, 0, 0), split_k_slices(1) { + + } + + /// Constructs an Arguments structure + CUTLASS_HOST_DEVICE + Arguments( + DualGemmMode mode, + GemmCoord problem_size_, + TensorRef ref_A0_, + TensorRef ref_B0_, + TensorRef ref_C0_, + TensorRef ref_D0_, + TensorRef ref_B1_, + TensorRef ref_C1_, + TensorRef ref_D1_, + TensorRef ref_D2_, + typename EpilogueOutputOp0::Params epilogue0_ = + typename EpilogueOutputOp0::Params(), + typename EpilogueOutputOp1::Params epilogue1_ = + typename EpilogueOutputOp1::Params(), + typename EpilogueOutputOp2::Params epilogue2_ = + typename EpilogueOutputOp2::Params(), + int split_k_slices_ = 1, + int batch_count = 1, + int64_t batch_stride_A = 0, + int64_t batch_stride_B0 = 0, + int64_t batch_stride_B1 = 0, + int64_t batch_stride_C = 0, + int64_t batch_stride_D = 0 + ): + mode(mode), + problem_size(problem_size_), + ref_A0(ref_A0_), + ref_B0(ref_B0_), + ref_C0(ref_C0_), + ref_D0(ref_D0_), + ref_B1(ref_B1_), + ref_C1(ref_C1_), + ref_D1(ref_D1_), + ref_D2(ref_D2_), + epilogue0(epilogue0_), + epilogue1(epilogue1_), + epilogue2(epilogue2_), + split_k_slices(split_k_slices_), + batch_count(batch_count), + batch_stride_A(batch_stride_A), + batch_stride_B0(batch_stride_B0), + batch_stride_B1(batch_stride_B1), + batch_stride_C(batch_stride_C), + batch_stride_D(batch_stride_D) { + + } + }; + +private: + + /// Kernel parameters object + typename DualGemmKernel::Params params_; + +public: + + /// Constructs the GEMM. + DualGemm() = default; + + /// Determines whether the GEMM can execute the given problem. 
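Putting the declarations above together, here is a hedged host-side sketch of how a `DualGemm` instantiation is typically driven once the operands are resident in device memory. The free function and its name are illustrative only; `device/dual_gemm.h` is the header defined here:

```
#include "device/dual_gemm.h"            // this header
#include "cutlass/util/device_memory.h"  // cutlass::device_memory::allocation

// Illustrative driver: populate Arguments, validate, initialize, and launch.
template <typename DualGemm>
cutlass::Status run_dual_gemm(
    cutlass::gemm::GemmCoord problem_size,
    typename DualGemm::TensorRefA ref_A0,
    typename DualGemm::TensorRefB0 ref_B0,
    typename DualGemm::TensorRefC ref_C0,
    typename DualGemm::TensorRefD ref_D0,
    typename DualGemm::TensorRefB1 ref_B1,
    typename DualGemm::TensorRefC ref_C1,
    typename DualGemm::TensorRefD ref_D1,
    typename DualGemm::TensorRefD ref_D2,
    typename DualGemm::EpilogueOutputOp0::Params epilogue0,
    typename DualGemm::EpilogueOutputOp1::Params epilogue1) {

  typename DualGemm::Arguments args(
      cutlass::gemm::DualGemmMode::kGemm, problem_size,
      ref_A0, ref_B0, ref_C0, ref_D0, ref_B1, ref_C1, ref_D1, ref_D2,
      epilogue0, epilogue1, typename DualGemm::EpilogueOutputOp2::Params(),
      /*split_k_slices=*/1);

  cutlass::Status status = DualGemm::can_implement(args);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  // Serial split-K (if enabled) needs a zero-initialized semaphore workspace.
  cutlass::device_memory::allocation<uint8_t> workspace(DualGemm::get_workspace_size(args));

  DualGemm dual_gemm_op;
  status = dual_gemm_op.initialize(args, workspace.get());
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  return dual_gemm_op();  // run on the default stream
}
```

The `can_implement` check that follows in this header performs the same argument validation (mode, split-K, and the `kStoreD0`/`kStoreD1` flags) before any launch.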
+ static Status can_implement(Arguments const &args) { + + if (args.mode == DualGemmMode::kBatched && kSplitKSerial) { + return Status::kErrorInvalidProblem; + } + if (!kSplitKSerial && args.split_k_slices > 1) { + return Status::kErrorInvalidProblem; + } + if (kStoreD0 != (args.ref_D0.data() != nullptr)) { + return Status::kErrorInternal; + } + if (kStoreD1 != (args.ref_D1.data() != nullptr)) { + return Status::kErrorInternal; + } + + Status status = DualGemmKernel::can_implement( + args.problem_size, + args.ref_A0.non_const_ref(), + args.ref_B0.non_const_ref(), + args.ref_C0.non_const_ref(), + args.ref_D0, + args.ref_B1.non_const_ref(), + args.ref_C1.non_const_ref(), + args.ref_D1, + args.ref_D2 + ); + + if (status != Status::kSuccess) { + return status; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + + size_t bytes = 0; + + if (kSplitKSerial && args.split_k_slices > 1) { + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.split_k_slices); + + bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n()); + } + + return bytes; + } + + /// Initializes GEMM state from arguments. + Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) { + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.mode == DualGemmMode::kBatched ? args.batch_count : args.split_k_slices); + + if (kSplitKSerial) { + if (args.split_k_slices > 1) { + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + + size_t bytes = get_workspace_size(args); + + cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + } + else { + + if (args.split_k_slices > 1) { + return Status::kErrorInvalidProblem; + } + } + + // Initialize the Params structure + params_ = typename DualGemmKernel::Params{ + args.mode, + args.problem_size, + grid_shape, + args.ref_A0.non_const_ref(), + args.ref_B0.non_const_ref(), + args.ref_C0.non_const_ref(), + args.ref_D0, + args.ref_B1.non_const_ref(), + args.ref_C1.non_const_ref(), + args.ref_D1, + args.ref_D2, + args.epilogue0, + args.epilogue1, + args.epilogue2, + reinterpret_cast(workspace), + args.batch_stride_A, + args.batch_stride_B0, + args.batch_stride_B1, + args.batch_stride_C, + args.batch_stride_D, + }; + + return Status::kSuccess; + } + + /// Lightweight update given a subset of arguments + Status update(Arguments const &args, void *workspace = nullptr) { + + if (kSplitKSerial && args.split_k_slices > 1) { + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + } + + params_.ref_A0.reset(args.ref_A0.non_const_ref().data()); + params_.ref_B0.reset(args.ref_B0.non_const_ref().data()); + params_.ref_C0.reset(args.ref_C0.non_const_ref().data()); + params_.ref_D0.reset(args.ref_D0.data()); + params_.ref_B1.reset(args.ref_B1.non_const_ref().data()); + params_.ref_C1.reset(args.ref_C1.non_const_ref().data()); + params_.ref_D1.reset(args.ref_D1.data()); + params_.ref_D2.reset(args.ref_D2.data()); + params_.output_op_0 = args.epilogue0; + params_.output_op_1 = args.epilogue1; + 
params_.output_op_2 = args.epilogue2; + params_.semaphore = reinterpret_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(DualGemmKernel::kThreadCount, 1, 1); + + cudaError_t result; + + int smem_size = int(sizeof(typename DualGemmKernel::SharedStorage)); + if (smem_size >= (48 << 10)) { + result = cudaFuncSetAttribute(Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + + cutlass::Kernel<<>>(params_); + + result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +} // namespace device +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/45_dual_gemm/dual_gemm.cu b/examples/45_dual_gemm/dual_gemm.cu new file mode 100644 index 0000000000..8043addec1 --- /dev/null +++ b/examples/45_dual_gemm/dual_gemm.cu @@ -0,0 +1,460 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief CUTLASS Dual-GEMM Example. 
+ + Fused kernel that outputs `D0` and `D1`. + We assume that B0/B1 have the same shape/layout + +``` +D0 = epilogue0(X @ B0, C0) +D1 = epilogue1(X @ B1, C1) +D2 = element_wise(D0, D1) +``` + D0 and D1 will be optionally stored in gmem (`kStoreD0` / `kStoreD1`) +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "device/dual_gemm.h" +#include "thread/left_silu_and_mul.h" +#include "dual_gemm_run.h" +#include "test_run.h" + + +//////////////////////////////////////////////////////////////////////////////// + +cutlass::gemm::GemmCoord problem_size(4096, 4096, 8192); +cutlass::gemm::GemmCoord batch_problem_size(321, 256, 512); + +constexpr int kStages = 3; +constexpr bool kSplitKSerial = false; +constexpr bool kUseBias = true; +constexpr int kBatchCount = 37; + + +#if 0 +using ElementOperandA = cutlass::bfloat16_t; +using ElementOperandB = cutlass::bfloat16_t; +using ElementOutput = cutlass::bfloat16_t; +using ElementAccumulator = float; +using ElementCompute = float; +#else +using ElementOperandA = cutlass::half_t; +using ElementOperandB = cutlass::half_t; +using ElementOutput = cutlass::half_t; +using ElementAccumulator = cutlass::half_t; +using ElementCompute = cutlass::half_t; +#endif + +constexpr auto kScaleType = kUseBias ? cutlass::epilogue::thread::ScaleType::NoBetaScaling : ( + // No bias + kSplitKSerial ? cutlass::epilogue::thread::ScaleType::Default : cutlass::epilogue::thread::ScaleType::Nothing +); +using EpilogueOutputOp0 = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute, + kScaleType +>; +using EpilogueOutputOp1 = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute, + kScaleType +>; +using EpilogueOutputOp2 = cutlass::epilogue::thread::LeftSiLUAndMul< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementOutput, + ElementCompute +>; + +const ElementCompute alpha0 = ElementCompute(1); +const ElementCompute beta0 = ElementCompute(kUseBias ? 1 : 0); +const ElementCompute alpha1 = ElementCompute(1); +const ElementCompute beta1 = ElementCompute(kUseBias ? 
1 : 0); + +bool run_nonfused_gemm_f16_sm80() { + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using Gemm0 = cutlass::gemm::device::Gemm< + ElementOperandA, + cutlass::layout::RowMajor, + ElementOperandB, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp0, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + kStages, + 8, + 8, + kSplitKSerial + >; + using Gemm1 = cutlass::gemm::device::Gemm< + ElementOperandA, + cutlass::layout::RowMajor, + ElementOperandB, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + kStages, + 8, + 8, + kSplitKSerial + >; + + NonFusedDualGemmRun nonFusedGemm; + + std::cout << "Running Non-fused GEMMs FP16 TN GEMMs...\n"; + + bool pass = nonFusedGemm.run( + problem_size, + alpha0, + beta0, + alpha1, + beta1, + true /* is_profiling */ + ); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + + return pass; +} + +template +struct LeftSiLUAndMul { + struct Params{}; + CUTLASS_HOST_DEVICE LeftSiLUAndMul(Params p) {} + + CUTLASS_HOST_DEVICE void set_k_partition(int, int) {} + + CUTLASS_HOST_DEVICE T operator() ( + T const &lhs, + T const &rhs) const { + cutlass::epilogue::thread::SiLu silu; + cutlass::multiplies mul; + auto silu_lhs = silu(lhs); + return mul(silu_lhs, rhs); + } + + template + CUTLASS_HOST_DEVICE cutlass::Array operator() ( + cutlass::Array const &lhs, + cutlass::Array const &rhs) const { + cutlass::epilogue::thread::SiLu silu; + cutlass::multiplies mul; + auto silu_lhs = silu(lhs); + return mul(silu_lhs, rhs); + } +}; + +bool run_fused_gemm_f16_sm80_shmem() { + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + // Optionally, we might not need intermediate GEMM outputs + constexpr bool kStoreD0 = true; + constexpr bool kStoreD1 = true; + + using DualGemm = cutlass::gemm::device::DualGemm< + ElementOperandA, + cutlass::layout::RowMajor, + ElementOperandB, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + EpilogueOutputOp2, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + kStages, + kStoreD0, + kStoreD1, + kSplitKSerial + >; + + DualFusedGemmRun fusedGemm; + + std::cout << "Running Fused FP16 TN GEMMs + Epilogue2...\n"; + + bool passed = fusedGemm.run( + problem_size, + alpha0, + beta0, + alpha1, + beta1 + ); + + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + + return passed; +} + +bool run_batched_fused_gemm_f16_sm80_shmem() { + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + // Optionally, we might not need 
intermediate GEMM outputs + constexpr bool kStoreD0 = true; + constexpr bool kStoreD1 = true; + + using DualGemm = cutlass::gemm::device::DualGemm< + ElementOperandA, + cutlass::layout::RowMajor, + ElementOperandB, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + EpilogueOutputOp2, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + kStages, + kStoreD0, + kStoreD1, + kSplitKSerial + >; + + DualFusedGemmRun fusedGemm; + + std::cout << "Running Batched Fused FP16 TN GEMMs + Epilogue2...\n"; + + bool passed = fusedGemm.run( + batch_problem_size, + alpha0, + beta0, + alpha1, + beta1, + kBatchCount, + false, /* broadcast_b1 */ + false /* is_profiling */ + ); + + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + + return passed; +} + +bool run_broadcast_fused_gemm_f16_sm80_shmem() { + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + // Optionally, we might not need intermediate GEMM outputs + constexpr bool kStoreD0 = true; + constexpr bool kStoreD1 = true; + + using DualGemm = cutlass::gemm::device::DualGemm< + ElementOperandA, + cutlass::layout::RowMajor, + ElementOperandB, + // different LayoutB0 and B1 + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + EpilogueOutputOp2, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + kStages, + kStoreD0, + kStoreD1, + kSplitKSerial + >; + + DualFusedGemmRun fusedGemm; + + std::cout << "Running Broadcast Fused FP16 TN GEMMs + Epilogue2...\n"; + + bool passed = fusedGemm.run( + problem_size, + alpha0, + beta0, + alpha1, + beta1, + 1, /* batch_count */ + true, /* broadcast_b1 */ + true /* is_profiling */ + ); + + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + + return passed; +} + +bool run_batched_broadcast_fused_gemm_f16_sm80_shmem() { + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + // Optionally, we might not need intermediate GEMM outputs + constexpr bool kStoreD0 = true; + constexpr bool kStoreD1 = true; + + using DualGemm = cutlass::gemm::device::DualGemm< + ElementOperandA, + cutlass::layout::RowMajor, + ElementOperandB, + // different LayoutB0 and B1 + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + EpilogueOutputOp2, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + kStages, + kStoreD0, + kStoreD1, + kSplitKSerial + >; + + DualFusedGemmRun fusedGemm; + + std::cout << "Running Batch Broadcast Fused FP16 TN GEMMs + Epilogue2...\n"; + + bool passed = fusedGemm.run( + batch_problem_size, + alpha0, + beta0, + alpha1, + beta1, + kBatchCount, + true, /* broadcast_b1 */ + false /* is_profiling */ + 
); + + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + + return passed; +} + +int main() { + + std::vectorfuncs = { + &run_nonfused_gemm_f16_sm80, + &run_fused_gemm_f16_sm80_shmem, + &run_batched_fused_gemm_f16_sm80_shmem, + &run_broadcast_fused_gemm_f16_sm80_shmem, + &run_batched_broadcast_fused_gemm_f16_sm80_shmem + }; + + std::string test_name = ( + "dual-gemm f16 bias=" + + std::to_string(kUseBias) + + " split_k_serial=" + + std::to_string(kSplitKSerial) + + " batch_count=" + + std::to_string(kBatchCount) + ); + + return testRun(80, funcs, test_name); +} + + + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/45_dual_gemm/dual_gemm_common.h b/examples/45_dual_gemm/dual_gemm_common.h new file mode 100644 index 0000000000..41f5cfea6a --- /dev/null +++ b/examples/45_dual_gemm/dual_gemm_common.h @@ -0,0 +1,52 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines common types used for all DualGemm operators. 
+*/ +#pragma once + +namespace cutlass { +namespace gemm { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +enum class DualGemmMode { + kGemm, + kBatched, + kInvalid +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/45_dual_gemm/dual_gemm_run.h b/examples/45_dual_gemm/dual_gemm_run.h new file mode 100644 index 0000000000..b53ee80668 --- /dev/null +++ b/examples/45_dual_gemm/dual_gemm_run.h @@ -0,0 +1,938 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
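For reference, the second-stage element-wise epilogue exercised by this example computes `D2 = SiLU(D0) * D1`, where `SiLU(x) = x * sigmoid(x)`. A host-only scalar sketch of that math, independent of the CUTLASS functors (the numeric values in the comment are just a worked example):

```
#include <cmath>
#include <cstdio>

// Scalar reference for the LeftSiLUAndMul epilogue: d2 = silu(d0) * d1,
// where silu(x) = x * sigmoid(x) = x / (1 + exp(-x)).
float left_silu_and_mul_ref(float d0, float d1) {
  float silu_d0 = d0 / (1.0f + std::exp(-d0));
  return silu_d0 * d1;
}

int main() {
  // Worked example: silu(1.0) is roughly 0.7311, so the result is roughly 1.4621.
  std::printf("%f\n", left_silu_and_mul_ref(1.0f, 2.0f));
  return 0;
}
```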
+ * + **************************************************************************************************/ +#pragma once + +#include +#include +#include +#include + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/tensor_relu.h" + +#include "cutlass/platform/platform.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/device/gemm_universal.h" + +#include "dual_gemm_common.h" +#include "helper.h" + +#define CHECK_GT(val1, val2) \ + if((val1) <= (val2)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n"; +#define CHECK_TRUE(val) \ + if(!(val)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n"; + +template < + typename OutputOp, + typename Element, + typename Layout> +struct TensorEpilogueForEachFunc { + /// View type + using TensorView = cutlass::TensorView; + + /// Coordinate in tensor's index space + using TensorCoord = typename TensorView::TensorCoord; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view_x0; + TensorView view_x1; + TensorView view_y; + OutputOp output_op; + + + // + // Methods + // + + Params( + TensorView view_x0_ = TensorView(), + TensorView view_x1_ = TensorView(), + TensorView view_y_ = TensorView(), + OutputOp output_op_ = OutputOp(typename OutputOp::Params{}) + ): + view_x0(view_x0_), view_x1(view_x1_), view_y(view_y_), output_op(output_op_) { + } + }; + + Params params; + + CUTLASS_DEVICE + TensorEpilogueForEachFunc(Params const ¶ms): params(params) { + + } + + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + Element const & x0 = params.view_x0.at(coord); + Element const & x1 = params.view_x1.at(coord); + Element& y = params.view_y.at(coord); + y = params.output_op(x0, x1); + } +}; + +template < + typename OutputOp, + typename Element, + typename Layout> +void TensorEpilogueForEach( + cutlass::TensorView x0, + cutlass::TensorView x1, + cutlass::TensorView y) { + + using Func = TensorEpilogueForEachFunc; + using Params = typename Func::Params; + + cutlass::reference::device::TensorForEach( + y.extent(), + Params(x0, x1, y) + ); +} + +//////////////////////////////////////////////////////////////////////////////// + +template +struct NonFusedDualGemmRun +{ + + using Gemm0 = Gemm0_; + using Gemm1 = Gemm1_; + using ElementAccumulator = typename Gemm0::ElementAccumulator; + using ElementCompute = typename Gemm0::GemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + cutlass::Distribution::Kind init_Bias; + uint64_t seed; + + // + // Methods + // + + NonFusedDualGemmRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), init_Bias(init_Bias_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + 
cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else if (dist_kind == cutlass::Distribution::AllZeros) { + cutlass::reference::host::TensorFill(view, Element(0)); + } + else if (dist_kind == cutlass::Distribution::AllOnes) { + cutlass::reference::host::TensorFill(view, Element(1)); + } + else { + std::cerr << "Not implemented\n"; + return false; + } + + return true; + } + + + + + /// Executes one test + bool run( + cutlass::gemm::GemmCoord problem_size, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool is_profiling = true, + bool relu = false, + int warm_ups = 1, + int runs = 100) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename Gemm0::ElementA, + typename Gemm0::LayoutA> tensor_A0(problem_size.mk()); + + cutlass::HostTensor< + typename Gemm0::ElementB, + typename Gemm0::LayoutB> tensor_B0(problem_size.kn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> tensor_C0(problem_size.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm0::LayoutC> tensor_Bias0({1, problem_size.n()}); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> tensor_D0(problem_size.mn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> reference_D0(problem_size.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementB, + typename Gemm1::LayoutB> tensor_B1(problem_size.kn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> tensor_C1(problem_size.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> tensor_Bias1({1, problem_size.n()}); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> tensor_D1(problem_size.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> reference_D1(problem_size.mn()); + + + CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); + CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018)); + CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017)); + CHECK_TRUE(initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed + 2014)); + CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016)); + CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015)); + CHECK_TRUE(initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed + 2013)); + + cutlass::reference::host::TensorFill( + tensor_D0.host_view()); + cutlass::reference::host::TensorFill( + tensor_D1.host_view()); + cutlass::reference::host::TensorFill( + reference_D0.host_view()); + cutlass::reference::host::TensorFill( + reference_D1.host_view()); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_C0.sync_device(); + tensor_Bias0.sync_device(); + tensor_D0.sync_device(); + 
reference_D0.sync_device(); + tensor_B1.sync_device(); + tensor_C1.sync_device(); + tensor_Bias1.sync_device(); + tensor_D1.sync_device(); + reference_D1.sync_device(); + + // + // Initialize the GEMM operator + // + + int split_k_slices = Gemm0::kSplitKSerial ? 2 : 1; + typename Gemm0::Arguments arguments_0{ + problem_size, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + {tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)}, + tensor_D0.device_ref(), + {alpha0, beta0}, + split_k_slices + }; + + split_k_slices = Gemm1::kSplitKSerial ? 2 : 1; + typename Gemm1::Arguments arguments_1{ + problem_size, + tensor_A0.device_ref(), + tensor_B1.device_ref(), + {tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)}, + tensor_D1.device_ref(), + {alpha1, beta1}, + split_k_slices + }; + + + Gemm0 gemm_op_0; + Gemm1 gemm_op_1; + + // Allocate workspace memory + cutlass::device_memory::allocation workspace0(gemm_op_0.get_workspace_size(arguments_0)); + cutlass::device_memory::allocation workspace1(gemm_op_1.get_workspace_size(arguments_1)); + + cutlass::Status status = gemm_op_0.initialize(arguments_0, workspace0.get()); + + CUTLASS_CHECK(status); + + status = gemm_op_1.initialize(arguments_1, workspace1.get()); + + CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = gemm_op_0(); + CUTLASS_CHECK(status); + status = gemm_op_1(); + CUTLASS_CHECK(status); + } + + if (is_profiling) { + // + // Profile the GEMM + // + + cudaEvent_t start, stop1, stop2; + cudaEventCreate(&start); + cudaEventCreate(&stop1); + cudaEventCreate(&stop2); + + cudaEventRecord(start); + + for(int i = 0; i < runs; i++) { + status = gemm_op_0(); + + CUTLASS_CHECK(status); + } + cudaEventRecord(stop1); + for(int i = 0; i < runs; i++) { + status = gemm_op_1(); + + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop2); + cudaDeviceSynchronize(); + float gemm0Time, gemm1Time, totalTime; + cudaEventElapsedTime(&gemm0Time, start, stop1); + cudaEventElapsedTime(&gemm1Time, stop1, stop2); + cudaEventElapsedTime(&totalTime, start, stop2); + std::cout << "gemm 0 time " << gemm0Time / (float)runs << " ms\n"; + std::cout << "gemm 1 time " << gemm1Time / (float)runs << " ms\n"; + std::cout << "Non-fusion GEMM only time " << totalTime / (float)runs << " ms\n"; + } + + tensor_D0.sync_host(); + tensor_D1.sync_host(); + + // + // Verify + // + cutlass::reference::device::Gemm< + typename Gemm0::ElementA, typename Gemm0::LayoutA, + typename Gemm0::ElementB, typename Gemm0::LayoutB, + typename Gemm0::ElementC, typename Gemm0::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm0::Operator> + reference_gemm_0; + + cutlass::reference::device::Gemm< + typename Gemm1::ElementA, typename Gemm1::LayoutA, + typename Gemm1::ElementB, typename Gemm1::LayoutB, + typename Gemm1::ElementC, typename Gemm1::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm1::Operator> + reference_gemm_1; + + reference_gemm_0( + problem_size, + alpha0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + beta0, + {tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)}, + reference_D0.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D0.device_view()); + } + + reference_gemm_1( + problem_size, + alpha1, + tensor_A0.device_ref(), + tensor_B1.device_ref(), + beta1, + {tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)}, + reference_D1.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D1.device_view()); + } + + // Wait for kernels to 
finish + cudaDeviceSynchronize(); + reference_D0.sync_host(); + reference_D1.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); + + bool passed0 = cutlass::reference::host::TensorEquals( + reference_D0.host_view(), + tensor_D0.host_view()); + CHECK_TRUE(passed0); + + bool passed1 = cutlass::reference::host::TensorEquals( + reference_D1.host_view(), + tensor_D1.host_view()); + CHECK_TRUE(passed1); + if (!passed0 || !passed1) { + + std::stringstream fname; + + fname << "error_DualGemm_device_nonfused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream file(fname.str()); + + file + << "A0 =\n" << tensor_A0.host_view() + << "\nB0 =\n" << tensor_B0.host_view() + << "\nC0 =\n" << tensor_C0.host_view() + << "\nBias0:\n" << tensor_Bias0.host_view() << "\n" + << "\nD0 =\n" << tensor_D0.host_view() + << "\nB1 =\n" << tensor_B1.host_view() + << "\nC1 =\n" << tensor_C1.host_view() + << "\nBias1:\n" << tensor_Bias1.host_view() << "\n" + << "\n\nReference =\n" << reference_D1.host_view() + << "\nComputed =\n" << tensor_D1.host_view(); + } + return passed0 && passed1; + } +}; + +template +struct DualFusedGemmRun +{ + + using DualGemm = DualGemm_; + using ElementAccumulator = typename DualGemm::ElementAccumulator; + using ElementCompute = typename DualGemm::DualGemmKernel::Epilogue0::OutputOp::ElementCompute; + using EpilogueOutputOp2 = typename DualGemm::EpilogueOutputOp2; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + cutlass::Distribution::Kind init_Scale; + cutlass::Distribution::Kind init_Bias; + uint64_t seed; + + // + // Methods + // + + DualFusedGemmRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_Scale_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), + init_Scale(init_Scale_), init_Bias(init_Bias_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else if (dist_kind == cutlass::Distribution::AllZeros) { + cutlass::reference::host::TensorFill(view, Element(0)); + } + else if (dist_kind == cutlass::Distribution::AllOnes) { + cutlass::reference::host::TensorFill(view, Element(1)); + } + else { + std::cerr << "Not implemented\n"; + return false; + } + + return true; + } + + + + + /// Executes one
test + bool run( + cutlass::gemm::GemmCoord problem_size, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(1), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(1), + int batch_count = 1, + bool broadcast_b1 = false, + bool is_profiling = true, + bool relu = false, + int warm_ups = 1, + int runs = 100) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename DualGemm::ElementA, + typename DualGemm::LayoutA> tensor_A0( + cutlass::platform::is_same::value ? + cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.k()) : + cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.k())); + + cutlass::HostTensor< + typename DualGemm::ElementB, + typename DualGemm::LayoutB0> tensor_B0( + cutlass::platform::is_same::value ? + cutlass::MatrixCoord(batch_count * problem_size.k(), problem_size.n()) : + cutlass::MatrixCoord(problem_size.k(), batch_count * problem_size.n())); + + cutlass::HostTensor< + typename DualGemm::ElementC, + typename DualGemm::LayoutC> tensor_C0( + cutlass::platform::is_same::value ? + cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) : + cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n())); + + cutlass::HostTensor< + typename DualGemm::ElementC, + typename DualGemm::LayoutScaleBias> tensor_Bias0({batch_count, problem_size.n()}); + + cutlass::HostTensor< + typename DualGemm::ElementC, + typename DualGemm::LayoutC> tensor_D0( + cutlass::platform::is_same::value ? + cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) : + cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n())); + + cutlass::HostTensor< + typename DualGemm::ElementC, + typename DualGemm::LayoutC> reference_D0( + cutlass::platform::is_same::value ? + cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) : + cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n())); + + cutlass::HostTensor< + typename DualGemm::ElementB, + typename DualGemm::LayoutB1> tensor_B1( + cutlass::platform::is_same::value ? + cutlass::MatrixCoord(batch_count * problem_size.k(), problem_size.n()) : + cutlass::MatrixCoord(problem_size.k(), batch_count * problem_size.n())); + if (broadcast_b1) { + tensor_B1.resize({problem_size.k(), batch_count}); + } + + cutlass::HostTensor< + typename DualGemm::ElementC, + typename DualGemm::LayoutC> tensor_C1( + cutlass::platform::is_same::value ? + cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) : + cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n())); + + cutlass::HostTensor< + typename DualGemm::ElementC, + typename DualGemm::LayoutScaleBias> tensor_Bias1({batch_count, problem_size.n()}); + + cutlass::HostTensor< + typename DualGemm::ElementC, + typename DualGemm::LayoutC> tensor_D1( + cutlass::platform::is_same::value ? + cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) : + cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n())); + + cutlass::HostTensor< + typename DualGemm::ElementC, + typename DualGemm::LayoutC> tensor_D2( + cutlass::platform::is_same::value ? + cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) : + cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n())); + + cutlass::HostTensor< + typename DualGemm::ElementC, + typename DualGemm::LayoutC> reference_D1( + cutlass::platform::is_same::value ? 
+ cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) : + cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n())); + + cutlass::HostTensor< + typename DualGemm::ElementC, + typename DualGemm::LayoutC> reference_D2( + cutlass::platform::is_same::value ? + cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) : + cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n())); + + CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); + CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2118)); + CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017)); + CHECK_TRUE(initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed + 2011)); + CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2113)); + CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015)); + CHECK_TRUE(initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed + 2012)); + + cutlass::reference::host::TensorFill( + tensor_D0.host_view()); + cutlass::reference::host::TensorFill( + tensor_D1.host_view()); + cutlass::reference::host::TensorFill( + tensor_D2.host_view()); + cutlass::reference::host::TensorFill( + reference_D0.host_view()); + cutlass::reference::host::TensorFill( + reference_D1.host_view()); + cutlass::reference::host::TensorFill( + reference_D2.host_view()); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_C0.sync_device(); + tensor_Bias0.sync_device(); + tensor_B1.sync_device(); + tensor_C1.sync_device(); + tensor_Bias1.sync_device(); + tensor_D0.sync_device(); + tensor_D1.sync_device(); + tensor_D2.sync_device(); + reference_D0.sync_device(); + reference_D1.sync_device(); + reference_D2.sync_device(); + + // + // Batch strides (irrelevant when batch_count == 1) + // + + int64_t batch_stride_A = problem_size.m() * problem_size.k(); + int64_t batch_stride_B0 = problem_size.k() * problem_size.n(); + int64_t batch_stride_B1 = problem_size.k() * problem_size.n(); + if (broadcast_b1) { + // B1 is a (column) vector + batch_stride_B1 = problem_size.k(); + } + int64_t batch_stride_Bias = problem_size.n(); + int64_t batch_stride_D = problem_size.m() * problem_size.n(); + + // + // Initialize the GEMM operator + // + + int split_k_slices = DualGemm::kSplitKSerial ? 2 : 1; + typename cutlass::TensorRef nullptr_ref{}; + decltype(nullptr_ref) ref_B0, ref_B1; + if (beta0 != ElementCompute(0)) { + ref_B0 = {tensor_Bias0.device_data(), typename DualGemm::LayoutC::Stride(0)}; + } + if (beta1 != ElementCompute(0)) { + ref_B1 = {tensor_Bias1.device_data(), typename DualGemm::LayoutC::Stride(0)}; + } + typename DualGemm::Arguments arguments{ + (batch_count > 1 ? + cutlass::gemm::DualGemmMode::kBatched : + cutlass::gemm::DualGemmMode::kGemm), + problem_size, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + ref_B0, + DualGemm::kStoreD0 ? tensor_D0.device_ref() : nullptr_ref, + (broadcast_b1 ? + typename DualGemm::TensorRefB1(tensor_B1.device_data(), 0) : + tensor_B1.device_ref()), + ref_B1, + DualGemm::kStoreD1 ? 
tensor_D1.device_ref() : nullptr_ref, + tensor_D2.device_ref(), + {alpha0, beta0}, + {alpha1, beta1}, + {}, + split_k_slices, + batch_count, + batch_stride_A, + batch_stride_B0, + batch_stride_B1, + batch_stride_Bias, + batch_stride_D, + }; + + // + // Run the GEMM + // + + DualGemm b2b_gemm_op; + + cutlass::device_memory::allocation workspace(b2b_gemm_op.get_workspace_size(arguments)); + + cutlass::Status status = b2b_gemm_op.can_implement(arguments); + + CUTLASS_CHECK(status); + + status = b2b_gemm_op.initialize(arguments, workspace.get()); + + CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = b2b_gemm_op(); + CUTLASS_CHECK(status); + } + + if (is_profiling) { + // + // Profile the GEMM + // + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for(int i = 0; i < runs; i++) { + status = b2b_gemm_op(); + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop); + cudaDeviceSynchronize(); + float gemmTime; + cudaEventElapsedTime(&gemmTime, start, stop); + std::cout << "Fusion time " << gemmTime / (float)runs << " ms\n"; + } + + tensor_D0.sync_host(); + tensor_D1.sync_host(); + tensor_D2.sync_host(); + + // + // Verify + // + + using GemmUniversal0 = cutlass::gemm::device::GemmUniversal< + typename DualGemm::ElementA, typename DualGemm::LayoutA, + typename DualGemm::ElementB, typename DualGemm::LayoutB0, + typename DualGemm::ElementC, typename DualGemm::LayoutC, + ElementAccumulator + >; + + GemmUniversal0 reference_gemm0; + + typename GemmUniversal0::Arguments args0 { + (batch_count > 1 ? + cutlass::gemm::GemmUniversalMode::kBatched : + cutlass::gemm::GemmUniversalMode::kGemm), + problem_size, + batch_count, + {alpha0, beta0}, + tensor_A0.device_data(), + tensor_B0.device_data(), + tensor_Bias0.device_data(), + reference_D0.device_data(), + batch_stride_A, + batch_stride_B0, + batch_stride_Bias, + batch_stride_D, + tensor_A0.stride(0), + tensor_B0.stride(0), + 0, // zero stride for the bias vector + reference_D0.stride(0), + }; + + status = reference_gemm0.can_implement(args0); + CUTLASS_CHECK(status); + status = reference_gemm0(args0); + CUTLASS_CHECK(status); + + using GemmUniversal1 = cutlass::gemm::device::GemmUniversal< + typename DualGemm::ElementA, typename DualGemm::LayoutA, + typename DualGemm::ElementB, typename DualGemm::LayoutB1, + typename DualGemm::ElementC, typename DualGemm::LayoutC, + ElementAccumulator + >; + + GemmUniversal1 reference_gemm1; + + typename GemmUniversal1::Arguments args1 { + (batch_count > 1 ? + cutlass::gemm::GemmUniversalMode::kBatched : + cutlass::gemm::GemmUniversalMode::kGemm), + problem_size, + batch_count, + {alpha1, beta1}, + tensor_A0.device_data(), + tensor_B1.device_data(), + tensor_Bias1.device_data(), + reference_D1.device_data(), + batch_stride_A, + batch_stride_B1, + batch_stride_Bias, + batch_stride_D, + tensor_A0.stride(0), + (broadcast_b1 ? 
0 : tensor_B1.stride(0)), + 0, // zero stride for the bias vector + reference_D1.stride(0), + }; + + status = reference_gemm1.can_implement(args1); + CUTLASS_CHECK(status); + status = reference_gemm1(args1); + CUTLASS_CHECK(status); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D0.device_view()); + cutlass::reference::device::TensorReLu(reference_D1.device_view()); + } + + TensorEpilogueForEach(reference_D0.device_view(), reference_D1.device_view(), reference_D2.device_view()); + cudaDeviceSynchronize(); + reference_D0.sync_host(); + reference_D1.sync_host(); + reference_D2.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D2.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D2.host_view()), 0); + + bool passed_out0 = true; + if (DualGemm::kStoreD0) { + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0); + passed_out0 = cutlass::reference::host::TensorEquals( + reference_D0.host_view(), + tensor_D0.host_view()); + } + CHECK_TRUE(passed_out0); + + bool passed_out1 = true; + if (DualGemm::kStoreD1) { + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0); + passed_out1 = cutlass::reference::host::TensorEquals( + reference_D1.host_view(), + tensor_D1.host_view()); + } + CHECK_TRUE(passed_out1); + + bool passed_out2 = cutlass::reference::host::TensorEquals( + reference_D2.host_view(), + tensor_D2.host_view()); + CHECK_TRUE(passed_out2); + + bool passed = passed_out0 && passed_out1 && passed_out2; + if (!passed) + { + std::stringstream fname; + + fname << "error_DualGemm_device_fused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream file(fname.str()); + + file + << "A0 =\n" << tensor_A0.host_view() + << "\nB0 =\n" << tensor_B0.host_view() + << "\nC0 =\n" << tensor_C0.host_view() + << "\nBias0:\n" << tensor_Bias0.host_view() << "\n" + << "\nB1 =\n" << tensor_B1.host_view() + << "\nC1 =\n" << tensor_C1.host_view() + << "\nBias1:\n" << tensor_Bias1.host_view() << "\n" + << "\n\nReference0 =\n" << reference_D0.host_view() + << "\nComputed0 =\n" << tensor_D0.host_view() + << "\n\nReference1 =\n" << reference_D1.host_view() + << "\nComputed1 =\n" << tensor_D1.host_view() + << "\n\nReference2 =\n" << reference_D2.host_view() + << "\nComputed2 =\n" << tensor_D2.host_view(); + } + //std::cout << "A0 " << tensor_A0.host_view() << std::endl; + // std::cout << "reference_D0 " << reference_D0.host_view() << std::endl; + // std::cout << "reference_D1 " << reference_D1.host_view() << std::endl; + // std::cout << "reference_D2 " << reference_D2.host_view() << std::endl; + //std::cout << "reference_D0 " << reference_D0.host_view() << std::endl; + return passed; + } + +}; + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/45_dual_gemm/kernel/dual_gemm.h b/examples/45_dual_gemm/kernel/dual_gemm.h new file mode 100644 index 0000000000..417f6ff25c --- /dev/null +++ b/examples/45_dual_gemm/kernel/dual_gemm.h @@ -0,0 +1,545 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/semaphore.h" + +#include "../threadblock/dual_mma_multistage.h" +#include "../threadblock/dual_epilogue.h" +#include "../dual_gemm_common.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename DualMma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue0_, ///! Epilogue + typename Epilogue1_, ///! Epilogue + typename OutputOp2_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + bool SplitKSerial, ///! If true, code supporting split-K via serial reduction is enabled. 
+ bool StoreD0, + bool StoreD1 +> +struct DualGemm { + + using DualMma = DualMma_; + + using Epilogue0 = Epilogue0_; + using Epilogue1 = Epilogue1_; + using OutputOp0 = typename Epilogue0::OutputOp; + using OutputOp1 = typename Epilogue1::OutputOp; + using OutputOp2 = OutputOp2_; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static constexpr bool kStoreD0 = StoreD0; + static constexpr bool kStoreD1 = StoreD1; + + using DualEpilogue = cutlass::epilogue::threadblock::DualEpilogue< + typename Epilogue0::Shape, + typename Epilogue0::WarpMmaOperator, + Epilogue0::kPartitionsK, + typename Epilogue0::OutputTileIterator, + typename Epilogue0::AccumulatorFragmentIterator, + typename Epilogue0::WarpTileIterator, + typename Epilogue0::SharedLoadIterator, + OutputOp0, + OutputOp1, + OutputOp2, + typename Epilogue0::Padding, + kStoreD0, + kStoreD1, + Epilogue0::kFragmentsPerIteration, + true // IterationsUnroll + >; + + using ElementA = typename DualMma::IteratorA::Element; + using ElementB = typename DualMma::IteratorB0::Element; + using ElementC = typename DualEpilogue::OutputTileIterator::Element; + + static bool const kSplitKSerial = SplitKSerial; + static_assert(!kSplitKSerial || (kStoreD0 && kStoreD1), + "Split-K serial requires buffers for D0/D1 for reduction"); + + /// Warp count (concept: GemmShape) + using WarpCount0 = typename DualMma::WarpCount; + static int const kThreadCount = 32 * WarpCount0::kCount; + + /// Parameters structure + struct Params { + DualGemmMode mode; + cutlass::gemm::GemmCoord problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + int swizzle_log_tile; + + // Mma0 + typename DualMma::IteratorA::Params params_A0; + typename DualMma::IteratorA::TensorRef ref_A0; + typename DualMma::IteratorB0::Params params_B0; + typename DualMma::IteratorB0::TensorRef ref_B0; + typename Epilogue0::OutputTileIterator::Params params_C0; + typename Epilogue0::OutputTileIterator::TensorRef ref_C0; + typename Epilogue0::OutputTileIterator::Params params_D0; + typename Epilogue0::OutputTileIterator::TensorRef ref_D0; + typename OutputOp0::Params output_op_0; + + // Mma1 + typename DualMma::IteratorB1::Params params_B1; + typename DualMma::IteratorB1::TensorRef ref_B1; + typename Epilogue1::OutputTileIterator::Params params_C1; + typename Epilogue1::OutputTileIterator::TensorRef ref_C1; + typename Epilogue1::OutputTileIterator::Params params_D1; + typename Epilogue1::OutputTileIterator::TensorRef ref_D1; + typename OutputOp1::Params output_op_1; + + typename Epilogue1::OutputTileIterator::Params params_D2; + typename Epilogue1::OutputTileIterator::TensorRef ref_D2; + typename OutputOp2::Params output_op_2; + + int *semaphore; + int gemm_k_size; + + int64_t batch_stride_A; + int64_t batch_stride_B0; + int64_t batch_stride_B1; + int64_t batch_stride_C; + int64_t batch_stride_D; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): swizzle_log_tile(0), semaphore(0), gemm_k_size(0) { } + + CUTLASS_HOST_DEVICE + Params( + DualGemmMode mode, + cutlass::gemm::GemmCoord const & problem_size, + cutlass::gemm::GemmCoord const & grid_tiled_shape, + // Mma0: D0 = A @ B0 + C0 + typename DualMma::IteratorA::TensorRef ref_A0, + typename DualMma::IteratorB0::TensorRef ref_B0, + typename Epilogue0::OutputTileIterator::TensorRef ref_C0, + typename Epilogue0::OutputTileIterator::TensorRef ref_D0, + // Mma1: D1 = A @ B1 + C1 + typename DualMma::IteratorB1::TensorRef ref_B1, + typename Epilogue1::OutputTileIterator::TensorRef ref_C1, + typename Epilogue1::OutputTileIterator::TensorRef ref_D1, + + 
typename Epilogue1::OutputTileIterator::TensorRef ref_D2, + typename OutputOp0::Params output_op_0 = typename OutputOp0::Params(), + typename OutputOp1::Params output_op_1 = typename OutputOp1::Params(), + typename OutputOp2::Params output_op_2 = typename OutputOp2::Params(), + int *workspace = nullptr, + int64_t batch_stride_A = 1, + int64_t batch_stride_B0 = 1, + int64_t batch_stride_B1 = 1, + int64_t batch_stride_C = 1, + int64_t batch_stride_D = 1 + ): + mode(mode), + problem_size(problem_size), + grid_tiled_shape(grid_tiled_shape), + swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)), + // Mma0 + params_A0(ref_A0.layout()), + ref_A0(ref_A0), + params_B0(ref_B0.layout()), + ref_B0(ref_B0), + params_C0(ref_C0.layout()), + ref_C0(ref_C0), + params_D0(ref_D0.layout()), + ref_D0(ref_D0), + // Mma1 + params_B1(ref_B1.layout()), + ref_B1(ref_B1), + params_C1(ref_C1.layout()), + ref_C1(ref_C1), + params_D1(ref_D1.layout()), + ref_D1(ref_D1), + params_D2(ref_D2.layout()), + ref_D2(ref_D2), + output_op_0(output_op_0), + output_op_1(output_op_1), + output_op_2(output_op_2), + batch_stride_A(batch_stride_A), + batch_stride_B0(batch_stride_B0), + batch_stride_B1(batch_stride_B1), + batch_stride_C(batch_stride_C), + batch_stride_D(batch_stride_D) { + + int total_gemm_k_iterations = (problem_size.k() + DualMma::Shape::kK - 1) / DualMma::Shape::kK; + int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k(); + gemm_k_size = gemm_k_iterations * DualMma::Shape::kK; + + semaphore = workspace; + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename DualMma::SharedStorage main_loop; + typename DualEpilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + DualGemm() { } + + /// Determines whether kernel satisfies alignment + static Status can_implement( + cutlass::gemm::GemmCoord const & problem_size, + typename DualMma::IteratorA::TensorRef ref_A0, + typename DualMma::IteratorB0::TensorRef ref_B0, + typename Epilogue0::OutputTileIterator::TensorRef ref_C0, + typename Epilogue0::OutputTileIterator::TensorRef ref_D0, + typename DualMma::IteratorB1::TensorRef ref_B1, + typename Epilogue1::OutputTileIterator::TensorRef ref_C1, + typename Epilogue1::OutputTileIterator::TensorRef ref_D1, + typename Epilogue1::OutputTileIterator::TensorRef ref_D2) { + + static int const kAlignmentA = DualMma::IteratorA::AccessType::kElements; + static int const kAlignmentB = DualMma::IteratorB0::AccessType::kElements; + static int const kAlignmentC = Epilogue0::OutputTileIterator::kElementsPerAccess; + + if (!TensorRef_aligned(ref_A0, kAlignmentA)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_B0, kAlignmentB)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_C0, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_D0, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_B1, kAlignmentB)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_C1, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_D1, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_D2, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + return Status::kSuccess; + } + + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + // 
Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_offset = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || + params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { + + return; + } + + int offset_k = 0; + int problem_size_k = params.problem_size.k(); + + ElementA *ptr_A0 = static_cast(params.ref_A0.data()); + ElementB *ptr_B0 = static_cast(params.ref_B0.data()); + ElementB *ptr_B1 = static_cast(params.ref_B1.data()); + + // + // Fetch pointers based on mode. + // + if (params.mode == DualGemmMode::kGemm) { + if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) { + problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; + } + + offset_k = threadblock_tile_offset.k() * params.gemm_k_size; + } + else if (params.mode == DualGemmMode::kBatched) { + ptr_A0 += threadblock_tile_offset.k() * params.batch_stride_A; + ptr_B0 += threadblock_tile_offset.k() * params.batch_stride_B0; + ptr_B1 += threadblock_tile_offset.k() * params.batch_stride_B1; + } + + // Compute initial location in logical coordinates + cutlass::MatrixCoord tb_offset_A0{ + threadblock_tile_offset.m() * DualMma::Shape::kM, + offset_k, + }; + + cutlass::MatrixCoord tb_offset_B0{ + offset_k, + threadblock_tile_offset.n() * DualMma::Shape::kN + }; + + cutlass::MatrixCoord tb_offset_B1{ + offset_k, + threadblock_tile_offset.n() * DualMma::Shape::kN + }; + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename DualMma::IteratorA iterator_A0( + params.params_A0, + ptr_A0, + {params.problem_size.m(), problem_size_k}, + thread_idx, + tb_offset_A0); + + typename DualMma::IteratorB0 iterator_B0( + params.params_B0, + ptr_B0, + {problem_size_k, params.problem_size.n()}, + thread_idx, + tb_offset_B0); + + typename DualMma::IteratorB1 iterator_B1( + params.params_B1, + ptr_B1, + {problem_size_k, params.problem_size.n()}, + thread_idx, + tb_offset_B1); + + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. 
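  // (Each lane passes the same threadIdx.x / 32 value to __shfl_sync and reads
  //  lane 0's copy, so every lane receives an identical result and the compiler
  //  can treat warp_idx as uniform across the warp.)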
+ int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + + // Construct thread-scoped matrix multiply + typename DualMma::FragmentC accum0; + typename DualMma::FragmentC accum1; + accum0.clear(); + accum1.clear(); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = (problem_size_k - offset_k + DualMma::Shape::kK - 1) / DualMma::Shape::kK; + + DualMma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + if (!kSplitKSerial || gemm_k_iterations > 0) { + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, + accum0, accum1, + iterator_A0, iterator_B0, iterator_B1, + accum0, accum1); + } + + // + // Epilogue + // + + OutputOp0 output_op_0(params.output_op_0); + OutputOp1 output_op_1(params.output_op_1); + OutputOp2 output_op_2(params.output_op_2); + + // + // Masked tile iterators constructed from members + // + + threadblock_tile_offset = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + //assume identity swizzle + MatrixCoord threadblock_offset( + threadblock_tile_offset.m() * DualMma::Shape::kM, + threadblock_tile_offset.n() * DualMma::Shape::kN + ); + + int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + + ElementC *ptr_C0 = static_cast(params.ref_C0.data()); + ElementC *ptr_C1 = static_cast(params.ref_C1.data()); + ElementC *ptr_D0 = static_cast(params.ref_D0.data()); + ElementC *ptr_D1 = static_cast(params.ref_D1.data()); + ElementC *ptr_D2 = static_cast(params.ref_D2.data()); + + // Construct the semaphore. + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + if (params.mode == DualGemmMode::kGemm) { + // If performing a reduction via split-K, fetch the initial synchronization + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op_0.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); + output_op_1.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); + } + } + else if (params.mode == DualGemmMode::kBatched) { + ptr_C0 += threadblock_tile_offset.k() * params.batch_stride_C; + ptr_C1 += threadblock_tile_offset.k() * params.batch_stride_C; + ptr_D0 += threadblock_tile_offset.k() * params.batch_stride_D; + ptr_D1 += threadblock_tile_offset.k() * params.batch_stride_D; + ptr_D2 += threadblock_tile_offset.k() * params.batch_stride_D; + } + + // Tile iterator loading from source tensor. + typename Epilogue0::OutputTileIterator iterator_C0( + params.params_C0, + ptr_C0, + params.problem_size.mn(), + thread_idx, + threadblock_offset + ); + typename Epilogue1::OutputTileIterator iterator_C1( + params.params_C1, + ptr_C1, + params.problem_size.mn(), + thread_idx, + threadblock_offset + ); + + // Tile iterator writing to destination tensor. 
+ typename Epilogue0::OutputTileIterator iterator_D0( + params.params_D0, + ptr_D0, + params.problem_size.mn(), + thread_idx, + threadblock_offset + ); + typename Epilogue1::OutputTileIterator iterator_D1( + params.params_D1, + ptr_D1, + params.problem_size.mn(), + thread_idx, + threadblock_offset + ); + typename Epilogue1::OutputTileIterator iterator_D2( + params.params_D2, + ptr_D2, + params.problem_size.mn(), + thread_idx, + threadblock_offset + ); + + DualEpilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_offset.k()) { + iterator_C0 = iterator_D0; + iterator_C1 = iterator_D1; + } + + semaphore.wait(threadblock_tile_offset.k()); + + __threadfence(); + } + + // Execute the epilogue operator to update the destination tensor. + typename Epilogue0::OutputTileIterator source_iters[] = { + iterator_C0, iterator_C1 + }; + const bool writeToD2 = (!kSplitKSerial || params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1); + epilogue( + output_op_0, output_op_1, output_op_2, + iterator_D0, iterator_D1, iterator_D2, + accum0, accum1, + source_iters, + writeToD2 + ); + + // + // Release the semaphore + // + + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_offset.k() + 1; + } + + __threadfence(); + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + diff --git a/examples/45_dual_gemm/test_run.h b/examples/45_dual_gemm/test_run.h new file mode 100644 index 0000000000..4a58a3a16c --- /dev/null +++ b/examples/45_dual_gemm/test_run.h @@ -0,0 +1,95 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + + +#include + +// Run tests on GPUs + +int testRun(int arch, std::vector & test_funcs, const std::string & test_name) { + + bool supported = false; + + int arch_major = arch / 10; + int arch_minor = arch - arch / 10 * 10; + + if(arch_major >= 8) { + // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0. + // + // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples. + if (__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) { + supported = true; + } + } + else if(arch_major >= 7) { + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. + // + // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. + if (__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) { + supported = true; + } + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < arch_major || (props.major == arch_major && props.minor < arch_minor) ) { + supported = false; + } + + if (!supported) { + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + std::cout << "This example isn't supported on current architecture" << std::endl; + return 0; + } + + bool pass = true; + + std::cout << "Device: " << props.name << std::endl; + std::cout << "Arch: SM" << arch << std::endl; + std::cout << "Test: " << test_name << std::endl; + for(auto func : test_funcs) { + pass &= func(); + } + + + if(pass) + return 0; + else + return -1; + +} + diff --git a/examples/45_dual_gemm/thread/left_silu_and_mul.h b/examples/45_dual_gemm/thread/left_silu_and_mul.h new file mode 100644 index 0000000000..47043267f5 --- /dev/null +++ b/examples/45_dual_gemm/thread/left_silu_and_mul.h @@ -0,0 +1,150 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/epilogue/thread/linear_combination_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. +/// +/// D = alpha * accumulator + beta * source + uniform +/// +template < + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation. 
+ ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest +> +class LeftSiLUAndMul { +public: + + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + + static int const kCount = Count; + using FragmentOutput = Array; + using FragmentAccumulator = Array; + using ComputeFragment = Array; + + static FloatRoundStyle const kRound = Round; + + struct Params{}; + +private: + + // + // Data members + // + + ElementCompute alpha_; + ElementCompute beta_; + +public: + + /// Constructs the function object, possibly loading from pointers in host memory + CUTLASS_HOST_DEVICE + LeftSiLUAndMul(Params const &/*params*/) {} + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + return true; + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) { + assert(false); + } + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &lhs, + FragmentAccumulator const &rhs) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_to_compute; + + // Convert to destination numeric type + NumericArrayConverter compute_to_output; + + ComputeFragment converted_lhs = accumulator_to_compute(lhs); + ComputeFragment converted_rhs = accumulator_to_compute(rhs); + + cutlass::epilogue::thread::SiLu silu; + cutlass::multiplies mul; + auto silu_lhs = silu(converted_lhs); + return compute_to_output(mul(silu_lhs, converted_rhs)); + } + + CUTLASS_HOST_DEVICE + ElementOutput operator()( + ElementAccumulator const& lhs, + ElementAccumulator const& rhs + ) const { + ElementCompute convert_lhs(lhs); + ElementCompute convert_rhs(rhs); + cutlass::epilogue::thread::SiLu silu; + cutlass::multiplies mul; + auto silu_lhs = silu(convert_lhs); + return ElementOutput(mul(silu_lhs, convert_rhs)); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/45_dual_gemm/threadblock/dual_epilogue.h b/examples/45_dual_gemm/threadblock/dual_epilogue.h new file mode 100644 index 0000000000..3ef1c6d33c --- /dev/null +++ b/examples/45_dual_gemm/threadblock/dual_epilogue.h @@ -0,0 +1,426 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + + The epilogue rearranges the result of a matrix product through shared memory to match canonical + tensor layouts in global memory. Epilogues support conversion and reduction operations. + +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/layout/vector.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/tensor_coord.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/functional.h" + +#include "cutlass/gemm/gemm.h" + +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_iterator.h" + +#include "cutlass/epilogue/threadblock/epilogue_base.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Epilogue operator +template < + typename Shape_, ///< Shape of threadblock tile (concept: GemmShape) + typename WarpMmaOperator_, ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp) + int PartitionsK, ///< Number of partitions of the K dimension + typename OutputTileIterator_, ///< Tile iterator reading and writing output tensors + typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting accumulators + typename WarpTileIterator_, ///< Warp-scoped tile iterator writing accumulators to SMEM + typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading from SMEM + ///< Output operator + typename OutputOp0_, + typename OutputOp1_, + typename OutputOp2_, + typename Padding_, ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape) + bool StoreD0 = true, + bool StoreD1 = true, + int FragmentsPerPartition = 1, ///< Used to coarsten the epilogue granularity + int IterationsUnroll = ///< Used to reduce binary size when epilogue op is large + (!IsEpilogueFunctorHeavy::value) +> +class DualEpilogue { + +public: + + 
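  // In outline, each pair of accumulator tiles flows through this epilogue roughly as
  // follows (a sketch of the data path, not the exact fragment-level code):
  //
  //   frag0 = OutputOp0(accum0, source0);   // e.g. alpha0 * accum0 + beta0 * C0
  //   frag1 = OutputOp1(accum1, source1);   // e.g. alpha1 * accum1 + beta1 * C1
  //   frag2 = OutputOp2(frag0, frag1);      // e.g. SiLU(frag0) * frag1
  //
  //   D0 <- frag0 (if kStoreD0),  D1 <- frag1 (if kStoreD1),  D2 <- frag2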
using Base = EpilogueBase< + Shape_, + typename WarpMmaOperator_::Shape, + PartitionsK, + AccumulatorFragmentIterator_, + WarpTileIterator_, + Padding_, + FragmentsPerPartition>; + + using Shape = Shape_; + using WarpMmaOperator = WarpMmaOperator_; + static int const kPartitionsK = PartitionsK; + static bool constexpr kStoreD0 = StoreD0; + static bool constexpr kStoreD1 = StoreD1; + using OutputTileIterator = OutputTileIterator_; + using AccumulatorFragmentIterator = AccumulatorFragmentIterator_; + using WarpTileIterator = WarpTileIterator_; + using SharedLoadIterator = SharedLoadIterator_; + using OutputOp0 = OutputOp0_; + using OutputOp1 = OutputOp1_; + using OutputOp2 = OutputOp2_; + using Padding = Padding_; + + using Layout = layout::RowMajor; + using LongIndex = typename Layout::LongIndex; + + /// The complete warp-level accumulator tile + using AccumulatorTile = typename Base::AccumulatorTile; + + /// Accumulator element + using ElementAccumulator = typename WarpTileIterator::Element; + + /// Output element + using ElementOutput = typename OutputTileIterator::Element; + + /// Output access size + static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess; + + /// Tensor reference to destination tensor + using TensorRef = typename OutputTileIterator::TensorRef; + + /// Tensor reference to sync tensor + using SyncTensorRef = typename cutlass::TensorRef; + + /// Const tensor reference to source tensor + using ConstTensorRef = typename OutputTileIterator::ConstTensorRef; + + /// Array type used to output + using OutputAccessType = Array< + typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>; + + /// Array type used by output functor + using AccumulatorAccessType = Array; + + /// Number of warps + using WarpCount = typename Base::WarpCount; + + struct SharedStorage { + using Element = typename WarpTileIterator::Element; + + /// Tensor reference to shared memory allocation + using TensorRef = typename WarpTileIterator::TensorRef; + + /// Logical shape of the shared memory tile written to by all warps. + using Shape = typename Base::Shape; + + /// Shape of the shared memory allocation for the epilogue + using StorageShape = typename Base::SharedStorage::StorageShape; + + // + // Data members + // + + AlignedBuffer storage[2]; + + // + // Methods + // + + /// Returns a tensor reference to the shared memory buffer + CUTLASS_DEVICE + TensorRef reference(int i) { + return TensorRef( + storage[i].data(), + Layout::packed({StorageShape::kRow, StorageShape::kColumn})); + } + }; + + static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? 
Base::kFragmentsPerIteration : kPartitionsK; + static int constexpr kSmemPointerOffset = SharedStorage::StorageShape::kCount / kSmemTiles; + +public: + + static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements, + "Mismatch between shared load iterator and output tile iterator."); + + static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero."); + + static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess), + "Divisibility"); + +private: + + /// Loads fragment from shared memory aligned with output tensor + SharedLoadIterator shared_load_iterator0_; + SharedLoadIterator shared_load_iterator1_; + + /// Stores a warp's fragment of accumulators to SMEM + WarpTileIterator warp_tile_iterator0_; + WarpTileIterator warp_tile_iterator1_; + +public: + + /// Constructor + CUTLASS_DEVICE + DualEpilogue( + SharedStorage &shared_storage, ///< Shared storage object + int thread_idx, ///< ID of a thread within the threadblock + int warp_idx, ///< ID of warp within threadblock + int lane_idx ///< Id of thread within warp + ): + shared_load_iterator0_(shared_storage.reference(0), thread_idx), + shared_load_iterator1_(shared_storage.reference(1), thread_idx), + warp_tile_iterator0_(shared_storage.reference(0), lane_idx), + warp_tile_iterator1_(shared_storage.reference(1), lane_idx) + { + int warp_k = warp_idx / (WarpCount::kM * WarpCount::kN); + int warp_mn = warp_idx % (WarpCount::kM * WarpCount::kN); + int warp_m = warp_mn % WarpCount::kM; + int warp_n = warp_mn / WarpCount::kM; + + MatrixCoord warp_offset{warp_k * WarpCount::kM + warp_m, warp_n}; + + warp_tile_iterator0_.add_tile_offset(warp_offset); + warp_tile_iterator1_.add_tile_offset(warp_offset); + } + + /// Streams the result to global memory + CUTLASS_DEVICE + void operator()( + OutputOp0 const &output_op0, + OutputOp1 const &output_op1, + OutputOp2 const &output_op2, + OutputTileIterator dest0, + OutputTileIterator dest1, + OutputTileIterator dest2, + AccumulatorTile const &accumulator0, + AccumulatorTile const &accumulator1, + OutputTileIterator source_iterator[2], + bool writeToD2 // true if it's the final split-k + ) { + // TODO: Implement when no source is needed + + typename OutputTileIterator::Fragment source_fragment[2]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 2; ++i) { + source_fragment[i].clear(); + } + + // + // Iterator over warp-level accumulator fragment + // + + AccumulatorFragmentIterator accum_fragment_iterator[2] = {accumulator0, accumulator1}; + + // + // Iterate over accumulator tile + // + + #pragma unroll(IterationsUnroll ? 
OutputTileIterator::kIterations : 1) + for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { + + // + // Load the source + // + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 2; ++i) { + source_iterator[i].load(source_fragment[i]); + ++source_iterator[i]; + } + + // + // Convert and store fragment + // + + __syncthreads(); + + acc2smem_source_needed>::push( + iter, accum_fragment_iterator[0], this->warp_tile_iterator0_); + acc2smem_source_needed>::push( + iter, accum_fragment_iterator[1], this->warp_tile_iterator1_); + + __syncthreads(); + + // + // Load fragments from shared memory + // + + typename SharedLoadIterator::Fragment aligned_accum_fragment0[kPartitionsK]; + typename SharedLoadIterator::Fragment aligned_accum_fragment1[kPartitionsK]; + + shared_load_iterator0_.load(aligned_accum_fragment0[0]); + shared_load_iterator1_.load(aligned_accum_fragment1[0]); + + // If the number of k-slices is > 1 - perform a reduction amongst the k-slices + if (kPartitionsK > 1) { + + plus add_fragments; + + CUTLASS_PRAGMA_UNROLL + for ( int i = 1; i < kPartitionsK; ++i) { + shared_load_iterator0_.add_pointer_offset(kSmemPointerOffset); + shared_load_iterator1_.add_pointer_offset(kSmemPointerOffset); + shared_load_iterator0_.load(aligned_accum_fragment0[i]); + shared_load_iterator1_.load(aligned_accum_fragment1[i]); + aligned_accum_fragment0[0] = add_fragments(aligned_accum_fragment0[0], aligned_accum_fragment0[i]); + aligned_accum_fragment1[0] = add_fragments(aligned_accum_fragment1[0], aligned_accum_fragment1[i]); + } + + shared_load_iterator0_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset); + shared_load_iterator1_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset); + } + + // + // Compute the output result + // + + typename OutputTileIterator::Fragment output_fragment[3]; + + apply_output_operator_(output_fragment, + output_op0, output_op1, output_op2, + aligned_accum_fragment0[0], aligned_accum_fragment1[0], + source_fragment); + + + // + // Store the final result + // + + if (kStoreD0) { + dest0.store(output_fragment[0]); + ++dest0; + } + if (kStoreD1) { + dest1.store(output_fragment[1]); + ++dest1; + } + if (writeToD2) { + dest2.store(output_fragment[2]); + ++dest2; + } + } + } + +private: + + static_assert(kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, "One of these must be exactly 1."); + + template + struct acc2smem_source_needed; + + template + struct acc2smem_source_needed> { + template + CUTLASS_DEVICE + static void helper(AccumulatorFragmentIterator accum_fragment_iterator, + WarpTileIterator &warp_tile_iterator) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Advance; i++) { + ++accum_fragment_iterator; + } + + typename AccumulatorFragmentIterator::Fragment accum_fragment; + accum_fragment_iterator.load(accum_fragment); + warp_tile_iterator.store(accum_fragment); + } + + CUTLASS_DEVICE + static void push(size_t pos, + AccumulatorFragmentIterator const &iterator_begin, + WarpTileIterator &warp_tile_iterator) { + int dummy[] = {(pos == Seq) && (helper(iterator_begin, warp_tile_iterator), 0)...}; + } + }; + + /// Helper to invoke the output functor over each vector of output + CUTLASS_DEVICE + void apply_output_operator_( + typename OutputTileIterator::Fragment (&output_fragment)[3], + OutputOp0 const &output_op0, + OutputOp1 const &output_op1, + OutputOp2 const &output_op2, + typename SharedLoadIterator::Fragment const& aligned_accum_fragment0, + typename SharedLoadIterator::Fragment const& aligned_accum_fragment1, + typename 
OutputTileIterator::Fragment const (&source_fragment)[2]) { + + OutputAccessType* output_frag_ptr[3] = { + reinterpret_cast(&output_fragment[0]), + reinterpret_cast(&output_fragment[1]), + reinterpret_cast(&output_fragment[2]) + }; + + AccumulatorAccessType const *compute_frag_ptr[2] = { + reinterpret_cast(&aligned_accum_fragment0), + reinterpret_cast(&aligned_accum_fragment1) + }; + + OutputAccessType const *source_frag_ptr[2] = { + reinterpret_cast(&source_fragment[0]), + reinterpret_cast(&source_fragment[1]) + }; + + int const kOutputOpIterations = + OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kOutputOpIterations; ++i) { + + // Call the output operators + output_frag_ptr[0][i] = output_op0(compute_frag_ptr[0][i], source_frag_ptr[0][i]); + output_frag_ptr[1][i] = output_op1(compute_frag_ptr[1][i], source_frag_ptr[1][i]); + output_frag_ptr[2][i] = output_op2(output_frag_ptr[0][i], output_frag_ptr[1][i]); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/45_dual_gemm/threadblock/dual_mma_base.h b/examples/45_dual_gemm/threadblock/dual_mma_base.h new file mode 100644 index 0000000000..3a25da9c2c --- /dev/null +++ b/examples/45_dual_gemm/threadblock/dual_mma_base.h @@ -0,0 +1,232 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. 
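    Two B operands (B0 and B1) are staged in shared memory alongside a single A
    operand so that one pass over A can feed both halves of the dual GEMM.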
+*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/threadblock/mma_base.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy0_, + /// B1-specific version of the policy (concept: MmaPolicy) + typename Policy1_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class DualMmaBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy0 = Policy0_; + using Policy1 = Policy1_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator0 = typename Policy0::Operator; + using Operator1 = typename Policy1::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. + using WarpGemm = typename Policy0::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = GemmShape; + + /// Number of warp-level GEMM oeprations + static int const kWarpGemmIterations = + (WarpGemm::kK / Operator0::Policy::MmaShape::kK); + + /// Number of stages + static int const kStages = Stages; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the B operand + using TensorRefB0 = TensorRef; + using TensorRefB1 = TensorRef; + + static_assert(kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + static_assert((kWarpGemmIterations % 2) == 0, + "Inner loop iteration must be an even number."); + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage { + public: + // + // Type definitions + // + + /// Shape of the A matrix operand in shared memory + using ShapeA = MatrixShape; + + /// Shape of the B matrix operand in shared memory + using ShapeB0 = + MatrixShape; + using ShapeB1 = + MatrixShape; + + public: + // + // Data members + // + + /// Buffer for A operand + AlignedBuffer operand_A; + + /// Buffer for B operand + AlignedBuffer operand_B0; + AlignedBuffer operand_B1; + + public: + + // + // Methods + // + + /// Returns a layout object for the A matrix + CUTLASS_DEVICE + static typename Operator0::LayoutA LayoutA() { + return Operator0::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn}); + } + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator0::LayoutB LayoutB0() { + return Operator0::LayoutB::packed({ShapeB0::kRow, ShapeB0::kColumn}); + } + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator1::LayoutB LayoutB1() { + return Operator1::LayoutB::packed({ShapeB1::kRow, ShapeB1::kColumn}); + } + + /// Returns a TensorRef to the A operand + CUTLASS_HOST_DEVICE + TensorRefA operand_A_ref() { + return TensorRefA{operand_A.data(), LayoutA()}; + } + + /// Returns a 
TensorRef to the B operand + CUTLASS_HOST_DEVICE + TensorRefB0 operand_B0_ref() { + return TensorRefB0{operand_B0.data(), LayoutB0()}; + } + CUTLASS_HOST_DEVICE + TensorRefB1 operand_B1_ref() { + return TensorRefB1{operand_B1.data(), LayoutB1()}; + } + }; + + protected: + + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator0::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator0::IteratorB warp_tile_iterator_B0_; + typename Operator1::IteratorB warp_tile_iterator_B1_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + DualMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx), + warp_tile_iterator_B0_(shared_storage.operand_B0_ref(), lane_idx), + warp_tile_iterator_B1_(shared_storage.operand_B1_ref(), lane_idx) { + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/45_dual_gemm/threadblock/dual_mma_multistage.h b/examples/45_dual_gemm/threadblock/dual_mma_multistage.h new file mode 100644 index 0000000000..485922ef2e --- /dev/null +++ b/examples/45_dual_gemm/threadblock/dual_mma_multistage.h @@ -0,0 +1,775 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
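// Back-of-the-envelope sizing for DualMmaBase::SharedStorage above: A is staged once and shared by
// both GEMMs, while B0 and B1 each get their own staged buffer. The tile sizes, stage count, and
// half-precision element below are placeholders, and CUTLASS's padded smem layouts add a little on top.
#include <cstddef>
#include <cstdio>
int main() {
  int M = 128, N = 128, K = 32, stages = 3;                            // hypothetical threadblock tile and stage count
  std::size_t bytes_per_element = 2;                                   // e.g. cutlass::half_t
  std::size_t smem_A  = std::size_t(M) * K * stages * bytes_per_element;  // operand_A
  std::size_t smem_B0 = std::size_t(K) * N * stages * bytes_per_element;  // operand_B0
  std::size_t smem_B1 = std::size_t(K) * N * stages * bytes_per_element;  // operand_B1
  std::printf("approx. shared memory: %zu bytes\n", smem_A + smem_B0 + smem_B1);
  return 0;
}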
+ * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/threadblock/mma_base.h" +#include "dual_mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B0 operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB0_, + /// Iterates over tiles of B0 operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB0_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Iterates over tiles of B1 operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB1_, + /// Iterates over tiles of B1 operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB1_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy0_, + /// B1-specific version of the policy (concept: MmaPolicy) + typename Policy1_, + /// Number of stages, + int Stages, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone, + /// Used for partial specialization + typename Enable = bool> +class DualMmaMultistage : + public DualMmaBase { +public: + ///< Base class + using Base = DualMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B0 operand in global memory + using IteratorB0 = IteratorB0_; + ///< Iterates over tiles of B1 operand in global memory + using IteratorB1 = IteratorB1_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy0 = Policy0_; + using Policy1 = Policy1_; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB0 = SmemIteratorB0_; + using SmemIteratorB1 = SmemIteratorB1_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind 
const kCacheOpB = CacheOpB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = typename Policy0::Operator::FragmentC; + + /// Warp-level Mma + using Operator0 = typename Policy0::Operator; + using Operator1 = typename Policy1::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator0::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB0 = Operator0::kTransformB; + static ComplexTransform const kTransformB1 = Operator1::kTransformB; + + /// Internal structure exposed for introspection. + struct Detail { + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB0::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + + using WarpLoadedFragmentA = typename Operator0::FragmentA; + using WarpLoadedFragmentB0 = typename Operator0::FragmentB; + using WarpLoadedFragmentB1 = typename Operator1::FragmentB; + using WarpTransformedFragmentA = typename Operator0::TransformedFragmentA; + using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB; + using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB0 smem_iterator_B0_; + SmemIteratorB1 smem_iterator_B1_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + DualMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B0_(shared_storage.operand_B0_ref(), thread_idx), + smem_iterator_B1_(shared_storage.operand_B1_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + 
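// The warp-id decomposition just above, traced on the host with placeholder warp counts
// (the Base::WarpCount values here are assumptions, not taken from this diff):
#include <cstdio>
int main() {
  int kWarpCountM = 2, kWarpCountN = 2, kWarpCountK = 1;
  for (int warp_idx = 0; warp_idx < kWarpCountM * kWarpCountN * kWarpCountK; ++warp_idx) {
    int warp_idx_mn = warp_idx % (kWarpCountM * kWarpCountN);  // position within the MxN grid of warps
    int warp_idx_k  = warp_idx / (kWarpCountM * kWarpCountN);  // k-partition the warp belongs to
    int warp_idx_m  = warp_idx_mn % kWarpCountM;
    int warp_idx_n  = warp_idx_mn / kWarpCountM;
    std::printf("warp %d -> m=%d n=%d k=%d\n", warp_idx, warp_idx_m, warp_idx_n, warp_idx_k);
  }
  return 0;
}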
this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B0_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + this->warp_tile_iterator_B1_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB0 &iterator_B0, IteratorB1 &iterator_B1, + int group_start_A = 0, int group_start_B = 0) { + iterator_A.set_iteration_index(group_start_A * + IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_A.get(); + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B0.set_iteration_index(group_start_B * + IteratorB0::kAccessesPerVector); + iterator_B1.set_iteration_index(group_start_B * + IteratorB1::kAccessesPerVector); + this->smem_iterator_B0_.set_iteration_index(group_start_B); + this->smem_iterator_B1_.set_iteration_index(group_start_B); + + // Async Copy for operand B0 + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB0::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B0_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB0::ThreadMap::kElementsPerAccess / + IteratorB0::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B0.get(); + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_B0.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_B0.valid()); + } + + ++iterator_B0; + } + ++this->smem_iterator_B0_; + } + } + // Async Copy for operand B1 + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB1::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B1_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / + IteratorB1::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B1.get(); + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_B1.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_B1.valid()); + } + + ++iterator_B1; + } + ++this->smem_iterator_B1_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, 
+ ///< destination accumulator tile + FragmentC &accum0, + FragmentC &accum1, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB0 iterator_B0, + IteratorB1 iterator_B1, + ///< initial value of accumulator + FragmentC const &src_accum0, + FragmentC const &src_accum1 + ) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations) { + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B0.clear_mask(gemm_k_iterations == 0); + iterator_B1.clear_mask(gemm_k_iterations == 0); + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? kSrcBytes : 0); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + iterator_B0.set_iteration_index(0); + iterator_B1.set_iteration_index(0); + this->smem_iterator_B0_.set_iteration_index(0); + this->smem_iterator_B1_.set_iteration_index(0); + + // Async Copy for operand B0 + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB0::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B0_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB0::ThreadMap::kElementsPerAccess / + IteratorB0::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B0.get(), iterator_B0.valid()); + + ++iterator_B0; + } + + ++this->smem_iterator_B0_; + } + // Async Copy for operand B1 + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB1::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B1_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / + IteratorB1::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B1.get(), iterator_B1.valid()); + + ++iterator_B1; + } + + ++this->smem_iterator_B1_; + } + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B0.add_tile_offset({1, 0}); + iterator_B1.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B0_.add_tile_offset({1, 0}); + this->smem_iterator_B1_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum0 = src_accum0; + accum1 = src_accum1; + + // + // Clear the remaining tiles of SMEM. This is a functional requirement for some kernels + // so that all accumulator elements outside the GEMM footprint are zero. 
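// The kSrcBytes arithmetic used by the cp.async copies above, worked for one plausible
// configuration (the element width and thread-map values are assumptions for illustration):
#include <cstdio>
int main() {
  int sizeof_bits_of_element = 16;  // e.g. cutlass::half_t
  int elements_per_access    = 8;   // hypothetical ThreadMap::kElementsPerAccess
  int accesses_per_vector    = 1;   // hypothetical kAccessesPerVector
  int src_bytes = sizeof_bits_of_element * elements_per_access / accesses_per_vector / 8;
  std::printf("each cp.async moves %d bytes\n", src_bytes);  // 16 bytes, the largest cp.async size
  return 0;
}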
+ // + + if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) { + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_); + + typename IteratorA::AccessType zero_A; + zero_A.clear(); + + last_smem_iterator_A.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + last_smem_iterator_A.get()); + + *dst_ptr = zero_A; + + ++last_smem_iterator_A; + } + + typename IteratorB0::AccessType zero_B; + zero_B.clear(); + + /// Iterator to write threadblock-scoped tile of B0 operand to shared memory + SmemIteratorB0 last_smem_iterator_B0(this->smem_iterator_B0_); + last_smem_iterator_B0.set_iteration_index(0); + + // Async Copy for operand B0 + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB0::AccessType *dst_ptr = + reinterpret_cast( + last_smem_iterator_B0.get()); + + *dst_ptr = zero_B; + + ++last_smem_iterator_B0; + } + + /// Iterator to write threadblock-scoped tile of B1 operand to shared memory + SmemIteratorB1 last_smem_iterator_B1(this->smem_iterator_B1_); + last_smem_iterator_B1.set_iteration_index(0); + + // Async Copy for operand B1 + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + + typename IteratorB1::AccessType *dst_ptr = + reinterpret_cast( + last_smem_iterator_B1.get()); + + *dst_ptr = zero_B; + + ++last_smem_iterator_B1; + } + } + + // Waits until stages up to the previous (kStages-2)th stage have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB0 warp_loaded_frag_B0[2]; + WarpLoadedFragmentB1 warp_loaded_frag_B1[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB0 warp_transformed_frag_B0[2]; + WarpTransformedFragmentB1 warp_transformed_frag_B1[2]; + + Operator0 warp_mma0; + Operator1 warp_mma1; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B0_.set_kgroup_index(0); + this->warp_tile_iterator_B1_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]); + this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B0_; + ++this->warp_tile_iterator_B1_; + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B0.clear_mask(gemm_k_iterations == 0); + iterator_B1.clear_mask(gemm_k_iterations == 0); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma0.transform(warp_transformed_frag_A[0], warp_transformed_frag_B0[0], + warp_loaded_frag_A[0], warp_loaded_frag_B0[0]); + warp_mma1.transform(warp_transformed_frag_A[0], warp_transformed_frag_B1[0], + warp_loaded_frag_A[0], warp_loaded_frag_B1[0]); + + // tf32x3 kernels use staging accumulation. warp_mma uses a temporary + // accumulator and this temporary accumulator is added to the final + // accumulator once in every mainloop iteration. 
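// A scalar sketch of the staged accumulation described in the comment above: partial products
// land in a temporary accumulator and are folded into the final accumulator once per mainloop
// iteration, with a final drain after the loop. The iteration counts below are illustrative only.
#include <cstdio>
int main() {
  float accum = 0.0f, tmp_accum = 0.0f;
  int mainloop_iterations = 4, k_groups_per_iteration = 2;
  for (int it = 0; it < mainloop_iterations; ++it) {
    for (int k = 0; k < k_groups_per_iteration; ++k) {
      tmp_accum += 0.25f;                 // warp-level MMAs accumulate into the temporary
    }
    accum += tmp_accum;                   // fold the staged partial sum into the final accumulator
    tmp_accum = 0.0f;
  }
  std::printf("accum = %f\n", accum + tmp_accum);  // drain whatever remains, as done after the mainloop
  return 0;
}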
+ plus plus_accum; + + FragmentC tmp_accum0, tmp_accum1; + + if (platform::is_same::value + || platform::is_same::value) { + + tmp_accum0.clear(); + tmp_accum1.clear(); + } + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B0_; + ++this->warp_tile_iterator_B1_; + + if (warp_mma_k > 0) { + warp_mma0.transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B0[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B0[warp_mma_k % 2]); + warp_mma1.transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B1[warp_mma_k % 2]); + } + + if (platform::is_same::value + || platform::is_same::value) { + + warp_mma0( + tmp_accum0, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B0[warp_mma_k % 2], + tmp_accum0 + ); + warp_mma1( + tmp_accum1, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + tmp_accum1 + ); + + if (warp_mma_k == 0) { + accum0 = plus_accum(accum0, tmp_accum0); + accum1 = plus_accum(accum1, tmp_accum1); + tmp_accum0.clear(); + tmp_accum1.clear(); + } + } else { + warp_mma0( + accum0, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B0[warp_mma_k % 2], + accum0 + ); + warp_mma1( + accum1, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + accum1 + ); + } + + // Issue global->shared copies for the this stage + if (warp_mma_k < Base::kWarpGemmIterations - 1) { + int group_start_iteration_A, group_start_iteration_B; + + group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA; + group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, iterator_B0, iterator_B1, group_start_iteration_A, + group_start_iteration_B); + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + int group_start_iteration_A, group_start_iteration_B; + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, iterator_B0, iterator_B1, group_start_iteration_A, + group_start_iteration_B); + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until stages up to the previous (kStages-2)th stage have committed. 
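// Host-side trace of the circular smem stage bookkeeping advanced below; kStages is a placeholder.
// Writes start kStages-1 ahead of reads because the prologue pre-issues that many stages.
#include <cstdio>
int main() {
  int kStages = 3;
  int smem_write_stage_idx = kStages - 1;
  int smem_read_stage_idx  = 0;
  for (int step = 0; step < 6; ++step) {
    // Advance with wraparound, mirroring the add_tile_offset(+1) / add_tile_offset(-kStages) pattern.
    smem_write_stage_idx = (smem_write_stage_idx == kStages - 1) ? 0 : smem_write_stage_idx + 1;
    smem_read_stage_idx  = (smem_read_stage_idx  == kStages - 1) ? 0 : smem_read_stage_idx  + 1;
    std::printf("step %d: write stage %d, read stage %d\n",
                step, smem_write_stage_idx, smem_read_stage_idx);
  }
  return 0;
}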
+ arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B0.add_tile_offset({1, 0}); + iterator_B1.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B0_.add_tile_offset({1, 0}); + this->smem_iterator_B1_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0}); + this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy0::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B0_.add_tile_offset( + {-Base::kStages * Policy0::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + this->warp_tile_iterator_B1_.add_tile_offset( + {-Base::kStages * Policy1::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B0.clear_mask(gemm_k_iterations == 0); + iterator_B1.clear_mask(gemm_k_iterations == 0); + } + + // Do any conversions feeding the first stage at the end of the loop so + // we can start right away on mma instructions + if (warp_mma_k + 1 == Base::kWarpGemmIterations) { + warp_mma0.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B0[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B0[(warp_mma_k + 1) % 2]); + warp_mma1.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B1[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + } + } + + } + + if (platform::is_same::value + || platform::is_same::value) { + accum0 = plus_accum(accum0, tmp_accum0); + accum1 = plus_accum(accum1, tmp_accum1); + } + + // commit and drain all pending and predicated cp.async pnz from the GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/46_depthwise_simt_conv2dfprop/CMakeLists.txt b/examples/46_depthwise_simt_conv2dfprop/CMakeLists.txt new file mode 100644 index 0000000000..9a9e74c1cf --- /dev/null +++ b/examples/46_depthwise_simt_conv2dfprop/CMakeLists.txt @@ -0,0 +1,36 @@ + +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +cutlass_example_add_executable( + 46_depthwise_simt_conv2dfprop + depthwise_simt_conv2dfprop.cu + ) + diff --git a/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu b/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu new file mode 100644 index 0000000000..cc7d2f10f8 --- /dev/null +++ b/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu @@ -0,0 +1,682 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +/** +This example shows how to run depthwise 2d convolution kernels using functions and data structures +provided by CUTLASS, using SIMT instructions. + +There are 3 types of implementations of depthwise 2d convolution + 1. kAnalytic + Implicit gemm 2d convolution algorithm. + 2. kOptimized + An optimized algorithm that supports arbitrary stride and dilation. + 3. kFixedStrideDilation + An optimized algorithm with fixed stride and dilation to reduce the runtime computation and enable +more optimizations. + +In general, the perf of kFixedStrideDilation would be better than kOptimized. However, if the filter +size, stride or dilation is large, it would encounter register spilling and may hurt the perf. In +that case, please use kOptimized. + +For kOptimized and kFixedStrideDilation, in order to fully utilize GPU hardware resources and achieve +better perf, splitk should be enabled when the output tensor size is large. + +This example demonstrates how to construct and run a FixedStrideDilation depthwise 2d +convolution kernel. +*/ + +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/conv/kernel/default_depthwise_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/conv/device/direct_convolution.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +// The code section below describes the datatypes for input and output tensors and the computation between +// elements +using ElementAccumulator = cutlass::half_t; // Data type of accumulator +using ElementComputeEpilogue = cutlass::half_t; // Data type of epilogue computation (alpha, beta) +using ElementInputA = cutlass::half_t; // Data type of elements in input tensor +using ElementInputB = cutlass::half_t; // Data type of elements in input tensor +using ElementOutput = cutlass::half_t; // Data type of elements in output tensor + +using LayoutInputA = cutlass::layout::TensorNHWC; +using LayoutInputB = cutlass::layout::TensorNHWC; +using LayoutOutput = cutlass::layout::TensorNHWC; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassSimt; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm60; + +// This code section describes the groups a thread block will compute +constexpr int groups_per_cta = 64; + +// This code section describes the output tile a thread block will compute +using ThreadBlockOutputShape = cutlass::conv::TensorNHWCShape<1, 8, 8, groups_per_cta>; + +// This code section describes the filter shape +using FilterShape = cutlass::MatrixShape<3, 3>; + +// Threadblock tile shape +using ThreadblockShape = + cutlass::gemm::GemmShape; + +// This code section describes the tile size a warp will compute +// WarpShape::kM = P * Q the warps would process +// WarpShape::kN = groups_per_cta that the warps would process +// WarpShape::kK = filter_size that the warps would process +using WarpShape =
cutlass::gemm::GemmShape<16, groups_per_cta, FilterShape::kCount>; + +// This code section describes the size of MMA op +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle< + 1, + ThreadBlockOutputShape::kN, + ThreadBlockOutputShape::kH, + ThreadBlockOutputShape::kW>; + +// Number of pipelines you want to use +constexpr int NumStages = 4; + +// This code section describe iterator algorithm selected is kFixedStrideDilation +static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation; +using StrideShape = cutlass::MatrixShape<1, 1>; +using DilationShape = cutlass::MatrixShape<1, 1>; + +constexpr int kEpilogueElementsPerAccess = 128 / cutlass::sizeof_bits::value; + +// This code section describes the epilogue part of the kernel, we use default value +using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // Data type of output matrix. + kEpilogueElementsPerAccess, // The number of elements per vectorized. + // memory access. This becomes the vector width of + // math instructions in the epilogue too. + ElementAccumulator, // Data type of accumulator + ElementComputeEpilogue, // Data type for alpha/beta in linear combination + cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling>; // Epilogue scaling operation. + +using DepthwiseDirect2dConv = typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + ThreadBlockOutputShape, + FilterShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + IteratorAlgorithm, + cutlass::conv::StrideSupport::kFixed, + StrideShape, + DilationShape>::Kernel; + +using Direct2dConv = cutlass::conv::device::DirectConvolution; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + int groups; + int splitk; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + std::string tag; + + Options() + : help(false), + input_size(1, 128, 128, 32), + filter_size(32, 3, 3, 1), + groups(32), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + reference_check(false), + measure_performance(true), + iterations(20), + save_workspace(false), + alpha(1), + beta(0), + splitk(1) {} + + // Verify the problem size is compatible with the CUTLASS Convolution implementation. + bool valid() { + // + // CUTLASS attempts to load 128b vectors of cutlass::half_t (F16) elements. Consequently, + // all pointers, strides, and tensor extents must be divisible by 8 elements. 
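// The alignment requirement above, made explicit: the kernel issues 128-bit vector accesses of
// cutlass::half_t, so extents must be multiples of 128 / 16 = 8 elements (the kAlignment checked below).
#include <cstdio>
int main() {
  int access_bits  = 128;                           // vector width of the global-memory accesses
  int element_bits = 16;                            // cutlass::half_t
  int kAlignment   = access_bits / element_bits;    // = 8 elements
  std::printf("C and K must be multiples of %d\n", kAlignment);
  return 0;
}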
+ // + int const kAlignment = 8; + + if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) { + // misaligned tensors + return false; + } + + // depthwise conv + if (groups != input_size.c()) { + return false; + } + + if (filter_size.n() != groups) { + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || (padding.w() != filter_size.w() / 2)) { + return false; + } + + // Filter size passed through command line does not match filter size template parameter + if (filter_size.h() != FilterShape::kRow || filter_size.w() != FilterShape::kColumn) { + std::cerr << "Filter size passed in (" << filter_size.h() << "x" << filter_size.w() << ") " + << "must match the FilterShape template parameter of the convolution " + << "(" << FilterShape::kRow << "x" << FilterShape::kColumn << "). " + << "To use the filter shape passed in, change the FilterShape template " + << "parameter and recompile this example." + << std::endl; + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update(cutlass::Tensor4DCoord input_size, cutlass::Tensor4DCoord filter_size) { + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + if (cmd.check_cmd_line_flag("ref-check")) { + reference_check = true; + } + + if (cmd.check_cmd_line_flag("perf-check")) { + measure_performance = true; + } + + if (cmd.check_cmd_line_flag("save-workspace")) { + save_workspace = true; + } + + cmd.get_cmd_line_argument("n", input_size.n()); + cmd.get_cmd_line_argument("h", input_size.h()); + cmd.get_cmd_line_argument("w", input_size.w()); + cmd.get_cmd_line_argument("c", input_size.c()); + + cmd.get_cmd_line_argument("k", filter_size.n()); + cmd.get_cmd_line_argument("r", filter_size.h()); + cmd.get_cmd_line_argument("s", filter_size.w()); + + cmd.get_cmd_line_argument("g", groups); + + filter_size.c() = 1; + filter_size.n() = input_size.c(); + + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + cmd.get_cmd_line_argument("splitk", splitk); + + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("tag", tag); + + int32_t padding_h = filter_size.h() / 2; + int32_t padding_w = filter_size.w() / 2; + padding = {padding_h, padding_h, padding_w, padding_w}; + } + + /// Prints the usage statement. 
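// The padding rule in parse() and update() above, evaluated for the default 3x3 filter: half the
// filter extent on each side keeps H and W unchanged at unit stride ("same" padding).
#include <cstdio>
int main() {
  int R = 3, S = 3;
  int padding_h = R / 2;   // = 1
  int padding_w = S / 2;   // = 1
  std::printf("padding = {%d, %d, %d, %d}\n", padding_h, padding_h, padding_w, padding_w);
  return 0;
}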
+ std::ostream &print_usage(std::ostream &out) const { + out << "46_depthwise_gemm_fprop example\n\n" + << " This example uses Ampere's Tensor Core operators on F16 data types to compute\n" + << " forward convolution on tensors of layout NHWC.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --n= Input tensor extent N\n" + << " --h= Input tensor extent H\n" + << " --w= Input tensor extent W\n" + << " --c= Input tensor extent C\n" + << " --k= Filter extent K\n" + << " --r= Filter extent R\n" + << " --s= Filter extent S\n\n" + << " --g= Groups\n\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --splitk= Enable splitK\n\n" + << " --ref-check If set (true), reference check on the host is computed\n" + << " --perf-check If set (true), performance is measured.\n" + << " --iterations= Number of profiling iterations to perform.\n" + << " --save-workspace If set, workspace is written to a text file.\n" + << " --tag= String to replicate across the first column in the results " + "table\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/46_depthwise_simt_conv2dfprop/46_depthwise_simt_conv2dfprop --n=32 " + "--h=224 --w=224 --c=128 --k=128 --g=128 --r=3 --s=3\n\n" + << "$ ./examples/46_depthwise_simt_conv2dfprop/46_depthwise_simt_conv2dfprop --n=1 " + "--h=224 --w=224 --c=32 --k=32 --g=32 --r=3 --s=3 --splitk=10 --ref-check\n\n"; + + return out; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + // Number of multiply-adds = NPQK * CRS + int64_t fmas = + output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct Result { + double runtime_ms; + double gflops; + cutlass::Status status; + cutlass::Status reference_check; + cudaError_t error; + + Result() + : runtime_ms(0), + gflops(0), + status(cutlass::Status::kSuccess), + reference_check(cutlass::Status::kInvalid), + error(cudaSuccess) {} + + static std::ostream &print_header(std::ostream &out, Options const &options) { + if (!options.tag.empty()) { + out << "Name,"; + } + + out << "Layer,N,H,W,C,K,R,S,G,stride_h,stride_w,dilation_h,dilation_w,splitK,Runtime,GFLOPs"; + + return out; + } + + std::ostream &print(std::ostream &out, int idx, Options const &options) { + if (!options.tag.empty()) { + out << options.tag << ","; + } + + cutlass::Tensor4DCoord output_size = options.output_size(); + out << "conv_" << idx << "," << options.input_size.n() << "," << options.input_size.h() << "," + << options.input_size.w() << "," << options.input_size.c() << "," + + << options.filter_size.n() << "," << options.filter_size.h() << "," + << options.filter_size.w() << "," + + << options.groups << "," << options.conv_stride.row() << "," << options.conv_stride.column() + << "," + + << options.dilation.row() << "," << options.dilation.column() << "," + + << options.splitk << "," + + << runtime_ms << "," << gflops; + + return out; + } +}; + 
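// Options::output_size() and Options::gflops() above, evaluated on this example's default problem
// (1x128x128x32 input, 32 filters of 3x3x1, stride 1, padding 1):
#include <cstdio>
int main() {
  long N = 1, H = 128, W = 128, K = 32, R = 3, S = 3, C_per_group = 1;
  long pad_h = 1, pad_w = 1, stride_h = 1, stride_w = 1;
  long P = (H + 2 * pad_h - R) / stride_h + 1;               // output height = 128
  long Q = (W + 2 * pad_w - S) / stride_w + 1;               // output width  = 128
  long long fmas = 1LL * N * P * Q * K * (R * S * C_per_group);
  double gflop = 2.0 * double(fmas) / 1.0e9;                 // two flops per multiply-add
  std::printf("output = %ldx%ldx%ldx%ld, %.3f GFLOP per launch\n", N, P, Q, K, gflop);
  return 0;
}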
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Runs one testcase +Result profile_convolution(Options const &options) { + Result result; + + // + // Allocate host-device tensors using the CUTLASS Utilities. + // + + cutlass::HostTensor tensor_a(options.input_size); + cutlass::HostTensor tensor_b(options.filter_size); + cutlass::HostTensor tensor_b_transpose(options.filter_size); + cutlass::HostTensor tensor_c(options.output_size()); + cutlass::HostTensor tensor_d(options.output_size()); + cutlass::HostTensor tensor_ref_d(options.output_size()); + + // + // Initialize tensors + // + + // Fill tensor A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), 1, ElementInputA(5), ElementInputA(-6), 0); + + // Fill tensor B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), 1, ElementInputB(3), ElementInputB(-6), 0); + + // Fill tensor C on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c.host_view(), 1, ElementOutput(5), ElementOutput(-6), 0); + + // Fill tensor D on host with zeros + cutlass::reference::host::TensorFill(tensor_d.host_view()); + + // Fill tensor D for reference on host with zeros + cutlass::reference::host::TensorFill(tensor_ref_d.host_view()); + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_b_transpose.sync_device(); + tensor_c.sync_device(); + tensor_d.sync_device(); + tensor_ref_d.sync_device(); + + // + // Define arguments for CUTLASS Convolution + // + + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + // Split P*Q into multiple CTA + int split_k_slices = options.splitk; + + // Construct Conv2dProblemSize with user defined output size + cutlass::conv::Conv2dProblemSize problem_size(options.input_size, + options.filter_size, + options.padding, + options.conv_stride, + options.dilation, + options.output_size(), + mode, + split_k_slices, + options.groups); + + // Construct Direc2dConv::Argument structure with conv2d + // problem size, data pointers, and epilogue values + typename Direct2dConv::Arguments arguments{problem_size, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_c.device_ref(), + tensor_d.device_ref(), + {options.alpha, options.beta}, + tensor_b_transpose.device_ref()}; + + // + // Initialize CUTLASS Convolution + // + + Direct2dConv implicit_gemm_op; + + size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + result.status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(result.status); + + result.status = implicit_gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(result.status); + + // + // Launch initialized CUTLASS kernel + // + result.status = implicit_gemm_op(); + + CUTLASS_CHECK(result.status); + + // + // Optional reference check + // + + if (options.reference_check) { + std::cout << "Verification on host...\n"; + + // Compute with reference implementation + cutlass::reference::host::Conv2dFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementComputeEpilogue, + ElementAccumulator >(problem_size, + tensor_a.host_ref(), + tensor_b.host_ref(), + tensor_c.host_ref(), + tensor_ref_d.host_ref(), + options.alpha, + options.beta); + + 
// Check if output from CUTLASS kernel and reference kernel are equal or not + tensor_d.sync_host(); + + bool passed = + cutlass::reference::host::TensorEquals(tensor_d.host_view(), tensor_ref_d.host_view()); + + if (!passed) { + result.reference_check = cutlass::Status::kErrorInternal; + std::cout << "ERROR - results miscompared.\n"; + } else { + result.reference_check = cutlass::Status::kSuccess; + std::cout << "Passed.\n"; + } + } else { + result.reference_check = cutlass::Status::kInvalid; + } + + if (options.save_workspace) { + std::stringstream ss; + + ss << "46_depthwise_simt_conv2dfprop" << options.input_size.n() << "x" << options.input_size.h() + << "x" << options.input_size.w() << "x" << options.input_size.c() << "_" + << options.filter_size.n() << "x" << options.filter_size.h() << "x" + << options.filter_size.w() << "x" << options.filter_size.c() << ".dat"; + + std::ofstream output_workspace(ss.str()); + + output_workspace << "Input = \n" + << tensor_a.host_view() << "\n\n" + << "Filters = \n" + << tensor_b.host_view() << "\n\n"; + + if (options.reference_check) { + output_workspace << "Reference = \n" << tensor_ref_d.host_view() << "\n\n"; + } + + output_workspace << "Computed = \n" << tensor_d.host_view() << std::endl; + + std::cout << "Results written to '" << ss.str() << "'." << std::endl; + } + + // + // Performance measurement + // + + if (options.measure_performance) { + cudaEvent_t events[2]; + + for (auto &event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + } + + // Record an event at the start of a series of convolution operations. + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Launch a sequence of implicit GEMM operations on the device + for (int iteration = 0; iteration < options.iterations; ++iteration) { + result.status = implicit_gemm_op(); + CUTLASS_CHECK(result.status); + } + + // Record an event when the convolutions have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) + << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Print average runtime and GFLOPs. 
+ result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + } + + return result; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + bool notSupported = false; + + cudaDeviceProp props; + CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); + + if (!(props.major >= 6)) { + std::cerr << "Run on a machine with compute capability at least 60." << std::endl; + notSupported = true; + } + + if (notSupported) { + return 0; + } + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + Result result = profile_convolution(options); + + Result::print_header(std::cout, options) << std::endl; + result.print(std::cout, 1, options) << std::endl; + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/47_ampere_gemm_universal_streamk/CMakeLists.txt b/examples/47_ampere_gemm_universal_streamk/CMakeLists.txt new file mode 100644 index 0000000000..00be87ede6 --- /dev/null +++ b/examples/47_ampere_gemm_universal_streamk/CMakeLists.txt @@ -0,0 +1,45 @@ + +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +cutlass_example_add_executable( + 47_ampere_gemm_universal_streamk + ampere_gemm_universal_streamk.cu + ) + +# Deliberately test non-square sizes to ensure that internal transpose is +# not triggered when using SM80 EVT +set(TEST_COMMAND_00 --m=512 --n=768 --k=1152) + +cutlass_example_add_executable( + 47_ampere_gemm_universal_streamk_broadcast + ampere_gemm_universal_streamk_broadcast.cu + TEST_COMMAND_OPTIONS + TEST_COMMAND_00 + ) diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu new file mode 100644 index 0000000000..76bd0979de --- /dev/null +++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu @@ -0,0 +1,592 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*************************************************************************************************** + Example contrasting the Stream-K parallel decomposition for GEMM threadblocks versus the + "classic data-parallel" and "Split-K" decompositions. + + For more details regarding the Stream-K method, see "Stream-K: Work-centric Parallel Decomposition + for Dense Matrix-Matrix Multiplication on the GPU" (https://arxiv.org/abs/2301.03598) + + Requires NVIDIA Ampere or newer device (SM80+). + + - To lock persistence mode, power (400W), clocks (1005MHz) for evaluation (assumes device 0 and A100) + + cutlass$ sudo nvidia-smi -pm 1 -i 0 + + cutlass$ sudo nvidia-smi -i 0 -pl 400 + + cutlass$ sudo nvidia-smi -i 0 -lgc 1005 + + - Build and run: + + cutlass$ mkdir build + + cutlass$ cd build + + cutlass/build$ cmake .. 
-DCUTLASS_NVCC_ARCHS=80 + + cutlass/build$ make 47_ampere_gemm_universal_streamk + + cutlass/build$ ./examples/47_ampere_gemm_universal_streamk/47_ampere_gemm_universal_streamk + + 10000 timing iterations of 2048 x 2048 x 2048 matrix-matrix multiply + + Basic data-parallel GEMM + Disposition: Passed + Avg runtime: 0.112633 ms + GFLOPs: 152530 + + StreamK GEMM with default load-balancing + Disposition: Passed + Avg runtime: 0.0941929 ms + GFLOPs: 182390 + Speedup vs Basic-DP: 1.196 + + StreamK emulating basic data-parallel GEMM + Disposition: Passed + Avg runtime: 0.113119 ms + GFLOPs: 151875 + Speedup vs Basic-DP: 0.996 + + Basic split-K GEMM with tile-splitting factor 2 + Disposition: Passed + Avg runtime: 0.104772 ms + GFLOPs: 163973 + + StreamK emulating Split-K GEMM with tile-splitting factor 2 + Disposition: Passed + Avg runtime: 0.105379 ms + GFLOPs: 163029 + Speedup vs Basic-SplitK: 0.994 + + **************************************************************************************************/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + + + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations (cutlass_tensorop_h16816gemm_128x128_32x4_nn_align8) +///////////////////////////////////////////////////////////////////////////////////////////////// + +// A matrix configuration +using ElementA = cutlass::half_t; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = cutlass::half_t; // Element type for B matrix operand +using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// C/D matrix configuration +using ElementC = cutlass::half_t; // Element type for C and D matrix operands +using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C/D matrices in units of elements (up to 16 bytes) + +// Multiply-accumulate blocking/pipelining details +using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation +using ArchTag = cutlass::arch::Sm80; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; // Threadblock-level tile size (concept: GemmShape) +using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; // Warp-level tile size (concept: GemmShape) +using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; // Instruction-level tile size (concept: GemmShape) +constexpr int NumStages = 4; // Number of global->shared pipeline stages used in the GEMM mainloop + +// 
Epilogue output operator +using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementC, // Element type for C and D matrix operands + AlignmentC, // Memory access granularity of C and D matrix in units of elements + ElementAccumulator, // Element type from internal accumaccumulation + ElementAccumulator>; // Data type used to compute linear combination + +// Reference device GEMM implementation type +using DeviceGemmReference = cutlass::reference::device::Gemm< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + ElementAccumulator>; + +// Classic data-parallel device GEMM implementation type +using DeviceGemmBasic = cutlass::gemm::device::GemmUniversal< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + NumStages, + AlignmentA, + AlignmentB>; + +// StreamK device GEMM implementation type +using DeviceGemmStreamK = cutlass::gemm::device::GemmUniversal< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + cutlass::gemm::threadblock::ThreadblockSwizzleStreamK, // <-- Only difference + NumStages, + AlignmentA, + AlignmentB>; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result +{ + double avg_runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + Result( + double avg_runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess) + : + avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(true) + {} + +}; + + +/// Command line options parsing +struct Options +{ + std::string command_name; + bool help; + cutlass::gemm::GemmCoord problem_size; + float alpha; + float beta; + int split_k_factor; + int avail_sms; + bool reference_check; + int iterations; + + cutlass::HostTensor tensor_a; + cutlass::HostTensor tensor_b; + cutlass::HostTensor tensor_c; + cutlass::HostTensor tensor_d; + cutlass::HostTensor tensor_ref_d; + + Options(std::string command_name) : + command_name(command_name), + help(false), + problem_size({2048, 2048, 2048}), + alpha(1.0f), + beta(0.0f), + split_k_factor(1), + avail_sms(-1), // Number of device SMs to use is unlimited + reference_check(true), + iterations(10000) + {} + + bool valid() const + { + return true; + } + + void parse(int argc, char const **args) + { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + cmd.get_cmd_line_argument("split", split_k_factor); + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const + { + out + << "Performs a GEMM computation.\n" + << "\n" + << "Options:\n" + << "\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --split= Split-K factor to emulate\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << command_name << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + return 2.0 * double(problem_size.product()) / double(1.0e9) / runtime_s; + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Populates a DeviceGemmBasic::Arguments structure from the given commandline options +typename DeviceGemmBasic::Arguments args_from_options( + const DeviceGemmBasic &device_gemm, + const Options &options, + cutlass::HostTensor &tensor_a, + cutlass::HostTensor &tensor_b, + cutlass::HostTensor &tensor_c, + cutlass::HostTensor &tensor_d) +{ + return typename DeviceGemmBasic::Arguments( + cutlass::gemm::GemmUniversalMode::kGemm, // universal mode + options.problem_size, // problem_size + options.split_k_factor, // batch count / splitk slices + { // epilogue parameters + ElementAccumulator(options.alpha), + ElementAccumulator(options.beta) + }, + tensor_a.device_data(), // ptr_A + tensor_b.device_data(), // ptr_B + tensor_c.device_data(), // ptr_C + tensor_d.device_data(), // ptr_D + options.problem_size.mk().product(), // batch_stride_A + options.problem_size.nk().product(), // batch_stride_B + options.problem_size.mn().product(), // batch_stride_C + options.problem_size.mn().product(), // batch_stride_D + tensor_a.layout().stride(0), // stride_a + tensor_b.layout().stride(0), // stride_b + tensor_c.layout().stride(0), // stride_c + tensor_d.layout().stride(0)); // stride_d +} + +/// Populates a DeviceGemmStreamK::Arguments structure from the given commandline options +typename DeviceGemmStreamK::Arguments args_from_options( + const DeviceGemmStreamK &device_gemm, + const Options &options, + cutlass::HostTensor &tensor_a, + cutlass::HostTensor &tensor_b, + cutlass::HostTensor &tensor_c, + cutlass::HostTensor &tensor_d) +{ + return typename DeviceGemmStreamK::Arguments( + cutlass::gemm::GemmUniversalMode::kGemm, // universal mode + options.problem_size, // problem_size + options.split_k_factor, // batch count / splitk slices + { // epilogue parameters + ElementAccumulator(options.alpha), + ElementAccumulator(options.beta) + }, + tensor_a.device_data(), // ptr_A + tensor_b.device_data(), // ptr_B + tensor_c.device_data(), // ptr_C + tensor_d.device_data(), // ptr_D + options.problem_size.mk().product(), // batch_stride_A + options.problem_size.nk().product(), // batch_stride_B + options.problem_size.mn().product(), // batch_stride_C + options.problem_size.mn().product(), // batch_stride_D + tensor_a.layout().stride(0), // stride_a + tensor_b.layout().stride(0), // stride_b + tensor_c.layout().stride(0), // stride_c + tensor_d.layout().stride(0), // stride_d + options.avail_sms); // avail_sms +} + + +/// Execute a given example GEMM computation 
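The gflops() helper above counts two flops per multiply-accumulate over the full M x N x K volume. Plugging in the default 2048 x 2048 x 2048 problem together with the 0.112633 ms runtime quoted in the sample output near the top of this file reproduces the reported figure; a minimal check of that arithmetic:

// Sanity check of the GFLOP/s arithmetic used by Options::gflops(),
// using the runtime quoted in the file header's sample output.
#include <cstdio>

int main() {
  double m = 2048, n = 2048, k = 2048;
  double flop = 2.0 * m * n * k;              // two flops per multiply-add
  double runtime_s = 0.112633e-3;             // 0.112633 ms from the sample output
  std::printf("GFLOP/s: %.0f\n", flop / 1.0e9 / runtime_s);   // prints ~152530
  return 0;
}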
+template +Result run(std::string description, Options &options) +{ + // Display test description + std::cout << std::endl << description << std::endl; + + // Zero-initialize test output matrix D + cutlass::reference::host::TensorFill(options.tensor_d.host_view()); + options.tensor_d.sync_device(); + + // Instantiate CUTLASS kernel depending on templates + DeviceGemmT device_gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of DeviceGemmT + auto arguments = args_from_options(device_gemm, options, options.tensor_a, options.tensor_b, options.tensor_c, options.tensor_d); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = DeviceGemmT::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check the problem size is supported or not + CUTLASS_CHECK(device_gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(device_gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(device_gemm()); + + // Copy output data from CUTLASS and reference kernel to host for comparison + options.tensor_d.sync_host(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + Result result; + result.passed = cutlass::reference::host::TensorEquals( + options.tensor_d.host_view(), + options.tensor_ref_d.host_view()); + + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl; + + // Run profiling loop + if (options.iterations > 0) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(device_gemm()); + } + timer.stop(); + + // Compute average runtime and GFLOPs. + float elapsed_ms = timer.elapsed_millis(); + result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); + + std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + } + + if (!result.passed) { + exit(-1); + } + + return result; +} + + +/// Program entrypoint +int main(int argc, const char **argv) +{ + // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ >= 11)) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + // Current device must must have compute capability at least 80 + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(¤t_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + if (!((props.major * 10 + props.minor) >= 80)) + { + std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80." + << std::endl; + + // Returning zero so this test passes on older Toolkits. Its actions are no-op. 
+ return 0; + } + + // Parse commandline options + Options options("ampere_streamk_gemm"); + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + std::cout << + options.iterations << " timing iterations of " << + options.problem_size.m() << " x " << + options.problem_size.n() << " x " << + options.problem_size.k() << " matrix-matrix multiply" << std::endl; + + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + + // + // Initialize GEMM datasets + // + + // Initialize tensors using CUTLASS helper functions + options.tensor_a.resize(options.problem_size.mk()); // <- Create matrix A with dimensions M x K + options.tensor_b.resize(options.problem_size.kn()); // <- Create matrix B with dimensions K x N + options.tensor_c.resize(options.problem_size.mn()); // <- Create matrix C with dimensions M x N + options.tensor_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from CUTLASS kernel + options.tensor_ref_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from reference kernel + + // Fill matrix A on host with uniform-random data [-2, 2] + cutlass::reference::host::TensorFillRandomUniform( + options.tensor_a.host_view(), + 1, + ElementA(2), + ElementA(-2), + 0); + + // Fill matrix B on host with uniform-random data [-2, 2] + cutlass::reference::host::TensorFillRandomUniform( + options.tensor_b.host_view(), + 1, + ElementB(2), + ElementB(-2), + 0); + + // Fill matrix C on host with uniform-random data [-2, 2] + cutlass::reference::host::TensorFillRandomUniform( + options.tensor_c.host_view(), + 1, + ElementC(2), + ElementC(-2), + 0); + + + // + // Compute reference output + // + + // Copy data from host to GPU + options.tensor_a.sync_device(); + options.tensor_b.sync_device(); + options.tensor_c.sync_device(); + + // Zero-initialize reference output matrix D + cutlass::reference::host::TensorFill(options.tensor_ref_d.host_view()); + options.tensor_ref_d.sync_device(); + + // Create instantiation for device reference gemm kernel + DeviceGemmReference gemm_reference; + + // Launch device reference gemm kernel + gemm_reference( + options.problem_size, + ElementAccumulator(options.alpha), + options.tensor_a.device_ref(), + options.tensor_b.device_ref(), + ElementAccumulator(options.beta), + options.tensor_c.device_ref(), + options.tensor_ref_d.device_ref()); + + // Wait for kernels to finish + CUDA_CHECK(cudaDeviceSynchronize()); + + // Copy output data from reference kernel to host for comparison + options.tensor_ref_d.sync_host(); + + + // + // Evaluate CUTLASS kernels + // + + // Test default operation + if (options.split_k_factor == 1) + { + // Compare basic data-parallel version versus StreamK version using default load-balancing heuristics + Result basic_dp = run("Basic data-parallel GEMM", options); + Result streamk_default = run("StreamK GEMM with default load-balancing", options); + + printf(" Speedup vs Basic-DP: %.3f\n", (basic_dp.avg_runtime_ms / streamk_default.avg_runtime_ms)); + + // Show that StreamK can emulate basic data-parallel GEMM when we set the number of SMs to load-balance across = 1 + options.avail_sms = 1; // Set loadbalancing width to 1 SM (no load balancing) + Result streamk_dp = run("StreamK emulating basic data-parallel GEMM", options); + options.avail_sms = -1; // Reset loadbalancing width to unspecified SMs (i.e., the number of device SMs) + + printf(" Speedup 
vs Basic-DP: %.3f\n", (basic_dp.avg_runtime_ms / streamk_dp.avg_runtime_ms)); + + options.split_k_factor++; // Increment splitting factor for next evaluation + + } + + // Show that StreamK can emulate "Split-K" with a tile-splitting factor + Result basic_splitk = run( + std::string("Basic split-K GEMM with tile-splitting factor ") + std::to_string(options.split_k_factor), + options); + + Result streamk_splitk = run( + std::string("StreamK emulating Split-K GEMM with tile-splitting factor ") + std::to_string(options.split_k_factor), + options); + + printf(" Speedup vs Basic-SplitK: %.3f\n", (basic_splitk.avg_runtime_ms / streamk_splitk.avg_runtime_ms)); + + return 0; +} diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu new file mode 100644 index 0000000000..ed65e58c89 --- /dev/null +++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu @@ -0,0 +1,738 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*************************************************************************************************** + Example contrasting the Stream-K parallel decomposition for GEMM threadblocks versus the + "classic data-parallel" and "Split-K" decompositions + residual add. + + For more details regarding the Stream-K method, see "Stream-K: Work-centric Parallel Decomposition + for Dense Matrix-Matrix Multiplication on the GPU" (https://arxiv.org/abs/2301.03598) + + Requires NVIDIA Ampere or newer device (SM80+). 
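The description above contrasts the Stream-K decomposition with the classic data-parallel and Split-K schedules. As a conceptual aid only (this is not CUTLASS's internal scheduler code), Split-K with a tile-splitting factor divides the K extent into that many contiguous slices whose partial products are reduced afterwards:

// Conceptual sketch of Split-K partitioning: the K extent is split into
// `split` contiguous ranges, one per slice; the partial products over each
// range are reduced into the final output.
#include <cstdio>

int main() {
  int K = 2048;
  int split = 2;                              // --split factor from the command line
  for (int s = 0; s < split; ++s) {
    int k_begin = (K * s) / split;
    int k_end   = (K * (s + 1)) / split;
    std::printf("slice %d covers k in [%d, %d)\n", s, k_begin, k_end);
  }
  return 0;
}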
+ + - To lock persistence mode, power (400W), clocks (1005MHz) for evaluation (assumes device 0 and A100) + + cutlass$ sudo nvidia-smi -pm 1 -i 0 + + cutlass$ sudo nvidia-smi -i 0 -pl 400 + + cutlass$ sudo nvidia-smi -i 0 -lgc 1005 + + - Build and run: + + cutlass$ mkdir build + + cutlass$ cd build + + cutlass/build$ cmake .. -DCUTLASS_NVCC_ARCHS=80 + + cutlass/build$ make 47_ampere_gemm_universal_streamk_broadcast + + cutlass/build$ ./examples/47_ampere_gemm_universal_streamk/47_ampere_gemm_universal_streamk_broadcast + + - Reset clocks when done: + + cutlass$ sudo nvidia-smi -rgc + + **************************************************************************************************/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_with_broadcast.h" +#include "cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h" +#include "cutlass/epilogue/thread/linear_combination_residual_block.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/error_metrics.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_foreach.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" +#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include "helper.h" + + + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations (cutlass_tensorop_h16816gemm_128x128_32x4_nn_align8) +///////////////////////////////////////////////////////////////////////////////////////////////// + +// A matrix configuration +using ElementA = cutlass::half_t; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = cutlass::half_t; // Element type for B matrix operand +using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// C1/C2/D matrix configuration +using ElementC = cutlass::half_t; // Element type for C matrix operands +using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrices in units of elements (up to 16 bytes) + +// Output matrix configuration +using ElementOutput = cutlass::half_t; // Element type for output matrix operands +using LayoutOutput = cutlass::layout::RowMajor; // Layout type for output matrix operands +// constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of output matrices in units of elements (up to 16 bytes) + +// Multiply-accumulate blocking/pipelining details +using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation +using ElementCompute = cutlass::half_t; // Element type for compute 
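The Alignment constants in the operand configurations above express the widest vectorized access in units of elements: 128 bits per access divided by the element width, which for the half-precision operands used here is 8 elements, i.e. the 16-byte accesses mentioned in the comments. A minimal sketch of that arithmetic, assuming 16-bit elements:

// Alignment in elements = access width in bits / element width in bits.
#include <cstdio>

int main() {
  constexpr int kAccessBits  = 128;           // widest global-memory access
  constexpr int kElementBits = 16;            // width of cutlass::half_t
  constexpr int kAlignment   = kAccessBits / kElementBits;
  static_assert(kAlignment == 8, "half_t operands use 8-element (16-byte) accesses");
  std::printf("alignment in elements: %d\n", kAlignment);
  return 0;
}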
+using ArchTag = cutlass::arch::Sm80; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; // Threadblock-level tile size (concept: GemmShape) +using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; // Warp-level tile size (concept: GemmShape) +using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; // Instruction-level tile size (concept: GemmShape) +constexpr int NumStages = 4; // Number of global->shared pipeline stages used in the GEMM mainloop +constexpr int EVTEpilogueStages = 1; // Number of epilogue stages in EVT + +// Residual block configuration + +// Epilogue output operator +/// Using LinearCombinationResidualBlock +/// Models a residual block of the form: UnaryOp(BinaryOp(BinaryOp(ActivationOp(TensorOp(X) + bias), residual1), residual2)) +using EpilogueOp = cutlass::epilogue::thread::LinearCombinationResidualBlock< + ElementOutput, // Element type for output matrix + ElementAccumulator, // Element type from internal accumulation + ElementCompute, // Element type from internal accumulation + ElementC, // Element type for C1/C2/D matrix operands + AlignmentC, // Memory access granularity of C and D matrix in units of elements + cutlass::epilogue::thread::Identity, // Activation + cutlass::plus, // Binary operation 1 + cutlass::epilogue::thread::Identity, // Unary operation + cutlass::plus // Binary operation 2 + >; + +// Reference device GEMM implementation type +using DeviceGemmReference = cutlass::reference::device::Gemm< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + ElementAccumulator>; + +// Classic data-parallel device GEMM implementation type +using DeviceGemmBasic = cutlass::gemm::device::GemmUniversalWithBroadcast< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + NumStages, + AlignmentA, + AlignmentB>; + +// StreamK device GEMM implementation type with EVT +using namespace cute; + +using OutputTileThreadMap = cutlass::epilogue::threadblock::OutputTileThreadLayout< + ThreadblockShape, + WarpShape, + ElementC, + AlignmentC, + EVTEpilogueStages +>; + +using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; + +using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< + OutputTileThreadMap, ElementC, + cute::Stride<_0, _1, int32_t> // StrideMNL +>; + +using C1 = cutlass::epilogue::threadblock::VisitorAuxLoad< + OutputTileThreadMap, ElementC, + cute::Stride // StrideMNL +>; + +using C2 = cutlass::epilogue::threadblock::VisitorAuxLoad< + OutputTileThreadMap, ElementC, + cute::Stride // StrideMNL +>; + +using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::plus, ElementCompute, ElementCompute, + cutlass::FloatRoundStyle::round_to_nearest +>; + +using EVTCompute0 = cutlass::epilogue::threadblock::Sm80EVT< + Compute0, + Accum, + Bias>; + +using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::plus, ElementCompute, ElementCompute, + cutlass::FloatRoundStyle::round_to_nearest +>; + +using EVTCompute1 = cutlass::epilogue::threadblock::Sm80EVT< + Compute1, + EVTCompute0, + C1>; + +using Compute2 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::plus, ElementOutput, ElementCompute, + 
cutlass::FloatRoundStyle::round_to_nearest +>; + +using EVTCompute2 = cutlass::epilogue::threadblock::Sm80EVT< + Compute2, + EVTCompute1, + C2>; + +using D = cutlass::epilogue::threadblock::VisitorAuxStore< + OutputTileThreadMap, ElementOutput, cutlass::FloatRoundStyle::round_to_nearest, + cute::Stride // StrideMNL +>; + +using EVTD = cutlass::epilogue::threadblock::Sm80EVT< + D, + EVTCompute2>; + +using EVTKernelStreamK = + typename cutlass::gemm::kernel::DefaultGemmWithVisitor< + ElementA, LayoutA, cutlass::ComplexTransform::kNone, AlignmentA, + ElementB, LayoutB, cutlass::ComplexTransform::kNone, AlignmentB, + ElementC, LayoutC, AlignmentC, + ElementAccumulator, + ElementCompute, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape, + WarpShape, + InstructionShape, + EVTD, + cutlass::gemm::threadblock::ThreadblockSwizzleStreamK, + NumStages, + cutlass::arch::OpMultiplyAdd, + EVTEpilogueStages +>::GemmKernel; + +using DeviceGemmStreamK = cutlass::gemm::device::GemmUniversalAdapter; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result +{ + double avg_runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + Result( + double avg_runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess) + : + avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(true) + {} + +}; + + +/// Command line options parsing +struct Options +{ + std::string command_name; + bool help; + cutlass::gemm::GemmCoord problem_size; + float alpha; + float beta; + int split_k_factor; + int avail_sms; + int iterations; + bool real; + + cutlass::HostTensor tensor_a; + cutlass::HostTensor tensor_b; + cutlass::HostTensor tensor_c1; + cutlass::HostTensor tensor_c2; + cutlass::HostTensor tensor_d; + cutlass::HostTensor tensor_ref_d; + cutlass::HostTensor tensor_Vector; + // cutlass::HostTensor tensor_Tensor; + + Options(std::string command_name) : + command_name(command_name), + help(false), + problem_size({2048, 2048, 2048}), + alpha(1.0f), + beta(1.0f), + split_k_factor(1), + avail_sms(-1), // Number of device SMs to use is unlimited + real(false), + iterations(10000) + {} + + bool valid() const + { + return true; + } + + void parse(int argc, char const **args) + { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + cmd.get_cmd_line_argument("split", split_k_factor); + cmd.get_cmd_line_argument("iterations", iterations); + real = cmd.check_cmd_line_flag("real"); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const + { + out + << "Performs a GEMM computation.\n" + << "\n" + << "Options:\n" + << "\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= GEMM M dimension\n" + << " --n= GEMM N dimension\n" + << " --k= GEMM K dimension\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --split= Split-K factor to emulate\n\n" + << " --real If specified, initializes with real values instead of whole numbers. Errors are to be expected.\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << command_name << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + return 2.0 * double(problem_size.product()) / double(1.0e9) / runtime_s; + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Populates a DeviceGemmBasic::Arguments structure from the given commandline options +typename DeviceGemmBasic::Arguments args_from_options( + const DeviceGemmBasic &device_gemm, + const Options &options, + cutlass::HostTensor &tensor_a, + cutlass::HostTensor &tensor_b, + cutlass::HostTensor &tensor_c1, + cutlass::HostTensor &tensor_c2, + cutlass::HostTensor &tensor_d, + cutlass::HostTensor &tensor_Vector /*, + cutlass::HostTensor &tensor_Tensor */ + ) +{ + return typename DeviceGemmBasic::Arguments( + cutlass::gemm::GemmUniversalMode::kGemm, // universal mode + options.problem_size, // problem_size + options.split_k_factor, // batch count / splitk slices + { // epilogue parameters + ElementAccumulator(options.alpha), + ElementAccumulator(options.beta) + }, + tensor_a.device_data(), // ptr_A + tensor_b.device_data(), // ptr_B + tensor_c1.device_data(), // ptr_C1 + tensor_c2.device_data(), // ptr_C2 + tensor_d.device_data(), // ptr_D + tensor_Vector.device_data(), // ptr_Vector + /* tensor_Tensor.device_data(), */nullptr,// ptr_Tensor + options.problem_size.mk().product(), // batch_stride_A + options.problem_size.nk().product(), // batch_stride_B + options.problem_size.mn().product(), // batch_stride_C1 + options.problem_size.mn().product(), // batch_stride_C2 + options.problem_size.mn().product(), // batch_stride_D + options.problem_size.mn().product(), // batch_stride_Vector + options.problem_size.mn().product(), // batch_stride_Tensor + tensor_a.layout().stride(0), // stride_a + tensor_b.layout().stride(0), // stride_b + tensor_c1.layout().stride(0), // stride_c1 + tensor_c2.layout().stride(0), // stride_c2 + tensor_d.layout().stride(0), // stride_d + /*tensor_Vector.layout().stride(0)*/0, // stride_Vector + /*tensor_Tensor.layout().stride(0)*/0); // stride_Tensor +} + +/// Populates a DeviceGemmStreamK::Arguments structure from the given commandline options +typename DeviceGemmStreamK::Arguments args_from_options( + const DeviceGemmStreamK &device_gemm, + const Options &options, + cutlass::HostTensor &tensor_a, + cutlass::HostTensor &tensor_b, + cutlass::HostTensor &tensor_c1, + cutlass::HostTensor &tensor_c2, + cutlass::HostTensor &tensor_d, + cutlass::HostTensor &tensor_Vector/*, + cutlass::HostTensor &tensor_Tensor*/ + ) +{ + typename EVTD::Arguments callback_args{ + { + { + { + {}, // Accum + {tensor_Vector.device_data(), 
ElementC(0), {_0{}, _1{}, int32_t(options.problem_size.n())}}, // Bias + {} // Compute0 + }, // EVTCompute0 + {tensor_c1.device_data(), ElementC(0), {options.problem_size.n(), _1{}, options.problem_size.mn().product()}}, // C1 + {} // Compute1 + }, // EVTCompute1 + {tensor_c2.device_data(), ElementC(0), {options.problem_size.n(), _1{}, options.problem_size.mn().product()}}, // C2 + {} // Compute2 + }, // EVTCompute2 + {tensor_d.device_data(), {options.problem_size.n(), _1{}, options.problem_size.mn().product()}}, // D + }; // EVTD + + return typename DeviceGemmStreamK::Arguments( + cutlass::gemm::GemmUniversalMode::kGemm, // universal mode + options.problem_size, // problem_size + options.split_k_factor, // batch count / splitk slices + callback_args, // argument of EVT callbacks + tensor_a.device_data(), // ptr_A + tensor_b.device_data(), // ptr_B + nullptr, // ptr_C (unused) + nullptr, // ptr_D (unused) + options.problem_size.mk().product(), // batch_stride_A + options.problem_size.nk().product(), // batch_stride_B + 0, // batch_stride_C (unused) + 0, // batch_stride_D (unused) + tensor_a.layout().stride(0), // stride_a + tensor_b.layout().stride(0), // stride_b + 0, // stride_c (unused) + 0, // stride_d (unused) + options.avail_sms); // avail_sms +} + +/// Execute a given example GEMM computation +template +Result run(std::string description, Options &options) +{ + // Display test description + std::cout << std::endl << description << std::endl; + + // Zero-initialize test output matrix D + cutlass::reference::host::TensorFill(options.tensor_d.host_view()); + options.tensor_d.sync_device(); + + // Instantiate CUTLASS kernel depending on templates + DeviceGemmT device_gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of DeviceGemmT + auto arguments = args_from_options(device_gemm, options, + options.tensor_a, options.tensor_b, options.tensor_c1, options.tensor_c2, options.tensor_d, + options.tensor_Vector/*, options.tensor_Tensor*/); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = DeviceGemmT::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check the problem size is supported or not + CUTLASS_CHECK(device_gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(device_gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(device_gemm()); + + // Copy output data from CUTLASS and reference kernel to host for comparison + options.tensor_d.sync_host(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + Result result; + result.passed = cutlass::reference::host::TensorEquals( + options.tensor_d.host_view(), + options.tensor_ref_d.host_view()); + + double err = cutlass::reference::host::TensorRelativeErrorMetric( + options.tensor_d.host_view(), + options.tensor_ref_d.host_view()); + + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << " \t Relative error: " << err << std::endl; + + // Run profiling loop + if (options.iterations > 0) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(device_gemm()); + } + timer.stop(); + + // Compute average runtime and GFLOPs. 
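The visitor tree assembled in callback_args above composes three plus nodes: Compute0 adds the row-broadcast bias to the accumulator, Compute1 adds the C1 operand, Compute2 adds C2, and the D node stores the result. A host-side sketch of that per-element composition, using small hypothetical sizes and values purely for illustration:

// Per-element view of the EVT tree: D = ((Accum + Bias) + C1) + C2.
#include <cstdio>
#include <vector>

int main() {
  int M = 2, N = 3;
  std::vector<float> acc(M * N, 1.0f);        // stands in for the GEMM accumulators
  std::vector<float> bias(N, 0.5f);           // row-broadcast vector (one value per column)
  std::vector<float> c1(M * N, 2.0f);         // first residual operand
  std::vector<float> c2(M * N, 3.0f);         // second residual operand
  std::vector<float> d(M * N);

  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float compute0 = acc[i * N + j] + bias[j];   // EVTCompute0: Accum + Bias
      float compute1 = compute0 + c1[i * N + j];   // EVTCompute1: ... + C1
      float compute2 = compute1 + c2[i * N + j];   // EVTCompute2: ... + C2
      d[i * N + j] = compute2;                     // D: aux store
    }
  }
  std::printf("d[0] = %f\n", d[0]);                // 1 + 0.5 + 2 + 3 = 6.5
  return 0;
}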
+ float elapsed_ms = timer.elapsed_millis(); + result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); + + std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPs: " << result.gflops << std::endl; + } + + // TODO: uncomment when results match + //if (!result.passed) { + // exit(-1); + //} + + return result; +} + + +/// Program entrypoint +int main(int argc, const char **argv) +{ + // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ >= 11)) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + // Current device must must have compute capability at least 80 + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(¤t_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + if (!((props.major * 10 + props.minor) >= 80)) + { + std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80." + << std::endl; + + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + // Parse commandline options + Options options("ampere_streamk_broadcast_gemm"); + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + std::cout << + options.iterations << " timing iterations of " << + options.problem_size.m() << " x " << + options.problem_size.n() << " x " << + options.problem_size.k() << " matrix-matrix multiply" << std::endl; + + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + + // + // Initialize GEMM datasets + // + + // Initialize tensors using CUTLASS helper functions + options.tensor_a.resize(options.problem_size.mk()); // <- Create matrix A with dimensions M x K + options.tensor_b.resize(options.problem_size.kn()); // <- Create matrix B with dimensions K x N + options.tensor_c1.resize(options.problem_size.mn()); // <- Create matrix C1 with dimensions M x N + options.tensor_c2.resize(options.problem_size.mn()); // <- Create matrix C2 with dimensions M x N + options.tensor_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from CUTLASS kernel + options.tensor_ref_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from reference kernel + options.tensor_Vector.resize({1, options.problem_size.n()}); // <- Create broadcast vector with dimensions N x 1 + // options.tensor_Tensor.resize(options.problem_size.mn()); // <- Create T matrix with dimensions M x N + + int _init_bits = options.real ? 
-1 : 0; + + // Fill matrix A on host with uniform-random data [-2, 2] + cutlass::reference::host::TensorFillRandomUniform( + options.tensor_a.host_view(), + 1, + ElementA(2), + ElementA(-2), _init_bits); + + // Fill matrix B on host with uniform-random data [-2, 2] + cutlass::reference::host::TensorFillRandomUniform( + options.tensor_b.host_view(), + 1, + ElementB(2), + ElementB(-2), _init_bits); + + // Fill matrix C1 on host with uniform-random data [-2, 2] + cutlass::reference::host::TensorFillRandomUniform( + options.tensor_c1.host_view(), + 1, + ElementC(2), + ElementC(-2), _init_bits); + + // Fill matrix C2 on host with uniform-random data [-2, 2] + cutlass::reference::host::TensorFillRandomUniform( + options.tensor_c2.host_view(), + 1, + ElementC(2), + ElementC(-2), _init_bits); + + cutlass::reference::host::TensorFillRandomUniform( + options.tensor_Vector.host_view(), + 1, + ElementC(2), + ElementC(-2), _init_bits); + + // + // Compute reference output + // + + // Copy data from host to GPU + options.tensor_a.sync_device(); + options.tensor_b.sync_device(); + options.tensor_c1.sync_device(); + options.tensor_c2.sync_device(); + options.tensor_Vector.sync_device(); + // options.tensor_Tensor.sync_device(); + + // Zero-initialize reference output matrix D + cutlass::reference::host::TensorFill(options.tensor_ref_d.host_view()); + options.tensor_ref_d.sync_device(); + + // Create instantiation for device reference gemm kernel + DeviceGemmReference gemm_reference; + + // Launch device reference gemm kernel + gemm_reference( + options.problem_size, + ElementAccumulator(options.alpha), + options.tensor_a.device_ref(), + options.tensor_b.device_ref(), + ElementAccumulator(options.beta), + options.tensor_c1.device_ref(), + options.tensor_ref_d.device_ref()); + + // Wait for kernels to finish + CUDA_CHECK(cudaDeviceSynchronize()); + + // Copy output data from reference kernel to host for comparison + options.tensor_ref_d.sync_host(); + + // Add broadcast vector (without multiplier) + // This is only possible because BinaryOp is addition, and UnaryOps are identity. + // This makes the addition of broadcast vector commutable. 
+ /// identity(plus(identity(alpha * (a * b) + v), beta * c)) == + /// alpha * a * b + v + beta * c == + /// (alpha * a * b + beta * c) + v == + /// GEMM(a, b, c) + v + // Vector broadcast on host + for (int i=0; i < options.problem_size.m(); ++i) { + for (int j=0; j < options.problem_size.n(); ++j) { + options.tensor_ref_d.host_view().ref().at({i, j}) += options.tensor_Vector.host_view().ref().at({0, j}); + options.tensor_ref_d.host_view().ref().at({i, j}) += options.tensor_c2.host_view().ref().at({i, j}); + } + } + + // Sync back with device just in case + options.tensor_ref_d.sync_device(); + + // + // Evaluate CUTLASS kernels + // + + // Test default operation + if (options.split_k_factor == 1) + { + // Compare basic data-parallel version versus StreamK version using default load-balancing heuristics + Result basic_dp = run("Basic data-parallel GEMM", options); + Result streamk_default = run("StreamK GEMM with default load-balancing", options); + + printf(" Speedup vs Basic-DP: %.3f\n", (basic_dp.avg_runtime_ms / streamk_default.avg_runtime_ms)); + + // Show that StreamK can emulate basic data-parallel GEMM when we set the number of SMs to load-balance across = 1 + options.avail_sms = 1; // Set loadbalancing width to 1 SM (no load balancing) + Result streamk_dp = run("StreamK emulating basic data-parallel GEMM", options); + options.avail_sms = -1; // Reset loadbalancing width to unspecified SMs (i.e., the number of device SMs) + + printf(" Speedup vs Basic-DP: %.3f\n", (basic_dp.avg_runtime_ms / streamk_dp.avg_runtime_ms)); + + options.split_k_factor++; // Increment splitting factor for next evaluation + + } + + // Show that StreamK can emulate "Split-K" with a tile-splitting factor + Result basic_splitk = run( + std::string("Basic split-K GEMM with tile-splitting factor ") + std::to_string(options.split_k_factor), + options); + + Result streamk_splitk = run( + std::string("StreamK emulating Split-K GEMM with tile-splitting factor ") + std::to_string(options.split_k_factor), + options); + + printf(" Speedup vs Basic-SplitK: %.3f\n", (basic_splitk.avg_runtime_ms / streamk_splitk.avg_runtime_ms)); + + return 0; +} diff --git a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu new file mode 100644 index 0000000000..164c785e01 --- /dev/null +++ b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu @@ -0,0 +1,510 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Simple Hopper GEMM example using CUTLASS 3.0 APIs for NVIDIA Hopper architecture + + This example demonstrate a simple way to instantiate and run a TF32 GEMM using the new CUTLASS 3.0 + APIs on NVIDIA Hopper architecture. New features that will be showcased in this example are as follows: + + 1. NVIDIA Hopper architecture introduces a new series of tensor core instructions (GMMA) + which are more efficient than the Ampere tensor core instructions. + + 2. NVIDIA Hopper architecture includes new Tensor Memory Accelerator (TMA) unit to transfer large + blocks of data efficiently between global memory and shared memory. TMA also supports asynchronous + copies between thread blocks in a cluster. Another advantage is that TMA can load in FP32 data and + convert them implicitly to TF32. + + 3. This example uses the Warp Specialized kernel design (see /media/docs/efficient_gemm.md for details). + + 4. A simple way to tune the CTA rasterization direction and swizzle pattern of Hopper kernels. Both the + CTA rasterization direction and swizzle pattern impact cross-CTA locality of accesses. By tuning we can + improve performance. 
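Point 4 above refers to the CTA rasterization direction and swizzle that the example exposes later through the scheduler arguments (arguments.scheduler.raster_order and arguments.scheduler.max_swizzle_size). As a conceptual illustration only, the two directions differ in which tile index moves fastest when the output tiles are visited; the sketch below prints both walk orders for a tiny hypothetical tile grid and does not reflect the actual persistent-scheduler implementation:

// Conceptual illustration of the two tile-visitation orders selected by --raster.
#include <cstdio>

int main() {
  int tiles_m = 2, tiles_n = 3;               // hypothetical output tile grid

  std::printf("N-fastest order:");
  for (int m = 0; m < tiles_m; ++m) {
    for (int n = 0; n < tiles_n; ++n) {
      std::printf(" (%d,%d)", m, n);
    }
  }

  std::printf("\nM-fastest order:");
  for (int n = 0; n < tiles_n; ++n) {
    for (int m = 0; m < tiles_m; ++m) {
      std::printf(" (%d,%d)", m, n);
    }
  }
  std::printf("\n");
  return 0;
}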
+ + Examples: + + $ ./examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm --m=2048 --n=2048 --k=2048 --rasterization=N --swizzle=2 +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler_params.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "helper.h" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// + +// A matrix configuration +using ElementA = float; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = float; // Element type for B matrix operand +using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// C/D matrix configuration +using ElementC = float; // Element type for C and D matrix operands +using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes) + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using TileShape = Shape<_128,_128,_32>; // Threadblock-level tile size +using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster +using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized based on the tile size +using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; // Kernel to launch based on the default setting in the Collective Builder + +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, + ElementC, LayoutC, AlignmentC, + ElementC, LayoutC, AlignmentC, + cutlass::epilogue::collective::EpilogueScheduleAuto + >::CollectiveOp; + +using CollectiveMainloop = typename 
cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementA, LayoutA, AlignmentA, + ElementB, LayoutB, AlignmentB, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, + cutlass::gemm::collective::KernelScheduleAuto + >::CollectiveOp; + +using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloop, + CollectiveEpilogue +>; + +using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + +// Reference device GEMM implementation type +using DeviceGemmReference = cutlass::reference::device::Gemm< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + ElementAccumulator>; + +using StrideA = typename Gemm::GemmKernel::StrideA; +using StrideB = typename Gemm::GemmKernel::StrideB; +using StrideC = typename Gemm::GemmKernel::StrideC; +using StrideD = typename Gemm::GemmKernel::StrideD; + +// +// Data members +// + +/// Initialization +StrideA stride_A; +StrideB stride_B; +StrideC stride_C; +StrideD stride_D; +uint64_t seed; + +cutlass::DeviceAllocation block_A; +cutlass::DeviceAllocation block_B; +cutlass::DeviceAllocation block_C; +cutlass::DeviceAllocation block_D; +cutlass::DeviceAllocation block_ref_D; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90Params::RasterOrderOptions; + +// Command line options parsing +struct Options { + + bool help; + + float alpha, beta; + int iterations; + int m, n, k; + RasterOrderOptions raster; + int swizzle; + + Options(): + help(false), + m(5120), n(4096), k(4096), + alpha(1.f), beta(0.f), + iterations(1000), + raster(RasterOrderOptions::Heuristic), + swizzle(1) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m); + cmd.get_cmd_line_argument("n", n); + cmd.get_cmd_line_argument("k", k); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations); + + char raster_char; + cmd.get_cmd_line_argument("raster", raster_char); + + if (raster_char == 'N' || raster_char == 'n') { + raster = RasterOrderOptions::AlongN; + } + else if (raster_char == 'M' || raster_char == 'm') { + raster = RasterOrderOptions::AlongM; + } + else if (raster_char == 'H' || raster_char == 'h') { + raster = RasterOrderOptions::Heuristic; + } + + cmd.get_cmd_line_argument("swizzle", swizzle, 1); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "48_hopper_warp_specialized_gemm\n\n" + << " Hopper FP32 GEMM using a Warp Specialized kernel.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --raster= CTA Rasterization direction (N for along N, M for along M, and H for heuristic)\n\n" + << " --swizzle= CTA Rasterization swizzle\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "48_hopper_warp_specialized_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * m * n * k; + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; + +/// Result structure +struct Result +{ + double avg_runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + Result( + double avg_runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess) + : + avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false) + {} + +}; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::DeviceAllocation& block, + uint64_t seed=2023) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = Element(2); + scope_min = Element(0); + } else if (bits_input <= 8) { + scope_max = Element(2); + scope_min = Element(-2); + } else { + scope_max = Element(8); + scope_min = Element(-8); + } + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, scope_max, scope_min, 0); + + return true; +} + +/// Initialize operands to be used in the GEMM and reference GEMM +void initialize(const Options &options) { + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1}); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1}); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, {options.m, options.n, 1}); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, {options.m, options.n, 1}); + + block_A.reset(options.m * options.k); + block_B.reset(options.k * options.n); + block_C.reset(options.m * options.n); + block_D.reset(options.m * options.n); + block_ref_D.reset(options.m * options.n); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); +} + +/// Populates a Gemm::Arguments structure from the given commandline options +typename Gemm::Arguments args_from_options(const Options &options) +{ + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + {options.m, options.n, options.k}, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, 
block_D.get(), stride_D} + }; + + arguments.scheduler.raster_order = options.raster; + // The tile scheduler will swizzle up to 8 and with the nearest multiple of 2 (i.e., 1, 2, 4, and 8) + arguments.scheduler.max_swizzle_size = options.swizzle; + + return arguments; +} + +bool verify(const Options &options) { + cutlass::TensorRef ref_A(block_A.get(), Gemm::LayoutA::packed({options.m, options.k})); + cutlass::TensorRef ref_B(block_B.get(), Gemm::LayoutB::packed({options.k, options.n})); + cutlass::TensorRef ref_C(block_C.get(), Gemm::LayoutC::packed({options.m, options.n})); + cutlass::TensorRef ref_D(block_ref_D.get(), Gemm::LayoutD::packed({options.m, options.n})); + + // + // Compute reference output + // + + // Create instantiation for device reference gemm kernel + DeviceGemmReference gemm_reference; + + // Launch device reference gemm kernel + gemm_reference( + {options.m, options.n, options.k}, + ElementAccumulator(options.alpha), + ref_A, + ref_B, + ElementAccumulator(options.beta), + ref_C, + ref_D); + + // Wait for kernel to finish + CUDA_CHECK(cudaDeviceSynchronize()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; +} + +/// Execute a given example GEMM computation +template +int run(Options &options) +{ + initialize(options); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm + auto arguments = args_from_options(options); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(gemm.run()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + Result result; + result.passed = verify(options); + + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl; + + if (!result.passed) { + exit(-1); + } + + // Run profiling loop + if (options.iterations > 0) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + CUTLASS_CHECK(gemm.run()); + } + timer.stop(); + + // Compute average runtime and GFLOPs. 
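+    // Note: the timed region above re-runs gemm.initialize() on every iteration, so the reported
+    // time includes argument/workspace setup as well as the kernel launch itself.
+    // gflops() assumes 2*M*N*K floating-point operations and expects a runtime in seconds,
+    // hence the division by 1000 below.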
+ float elapsed_ms = timer.elapsed_millis(); + result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); + + std::string raster = "Heuristic"; + + if (options.raster == RasterOrderOptions::AlongN) { + raster = "Along N"; + } + else if (options.raster == RasterOrderOptions::AlongM) { + raster = "Along M"; + } + + std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl; + std::cout << " Rasterization: " << raster << " with a maximum CTA swizzle of " << options.swizzle << std::endl; + std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; + } + + return 0; +} + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.0 Toolkit to run this example + // and must have compute capability at least 90. + if (__CUDACC_VER_MAJOR__ < 12) { + std::cerr << "This example requires CUDA 12 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(¤t_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (props.major < 9) { + std::cerr + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater).\n"; + return 0; + } + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // + // Evaluate CUTLASS kernels + // + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + run(options); +#endif + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/48_hopper_warp_specialized_gemm/CMakeLists.txt b/examples/48_hopper_warp_specialized_gemm/CMakeLists.txt new file mode 100644 index 0000000000..903da1ea6f --- /dev/null +++ b/examples/48_hopper_warp_specialized_gemm/CMakeLists.txt @@ -0,0 +1,35 @@ + +# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +cutlass_example_add_executable( + 48_hopper_warp_specialized_gemm + 48_hopper_warp_specialized_gemm.cu + ) diff --git a/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu new file mode 100644 index 0000000000..1e820ddb47 --- /dev/null +++ b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu @@ -0,0 +1,652 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Hopper GEMM example leveraging collective operation builders. + + This example showcases the use of CUTLASS's CollectiveBuilder to easily construct performant kernels + targeting the NVIDIA Hopper architecture. + + Background and motivation + ------------------------- + CUTLASS kernels are highly parameterizable via template parameters. To ease the selection of template + parameters, CUTLASS 2 leveraged DefaultGemmConfigurations. Given a small set of parameters, such as + the data types of operands and the compute capability of the GPU, DefaultGemmConfigurations defined sensible + defaults for the many other parameters to the kernel (e.g., warp shape, stage count). 
+ + However, DefaultGemmConfigurations leave multiple opportunities for improvement, which are addressed + in CUTLASS 3: + (1) DefaultGemmConfigurations do not allow one to use a more-performant set of parameters without + specifying every parameter. For example, the DefaultGemmConfigurations for GEMMs targeting + Ampere specify that three pipeline stages should be used regardless of the sizes of operands. + If one wished to increase this value, one would also need to specify all other template parameters. + This leaves a gap between a high-level ease-of-use interface and a lower-level detailed interface. + (2) A new DefaultGemmConfiguration was required for each combination of operand types, GPU architecture, + and operation type (e.g., Tensor Core or SIMT). This led to increased code size to cover each unique + configuration and a lack of extensibility from one DefaultGemmConfiguration to another. + + Alongside these opportunities for improvement, the Hopper architecture offers new features that increase + the number of valid configurations of a kernel. In addition to the many template parameters already available + in CUTLASS 2 kernels, CUTLASS 3 kernels targeting Hopper also have various scheduling modes to select from that control: + (1) how data is to be loaded (e.g., using the Hopper TMA feature or Ampere cp.async) + (2) how work is to be divided among warps in a thread block (e.g., whether to use "warp specialization") + (3) whether persistent thread blocks should be used + This increased configuration space further motivates rethinking DefaultGemmConfigurations. + + Introduction to the CollectiveBuilder + ------------------------------------- + CUTLASS 3 introduces the CollectiveBuilder to further ease the process of selecting template parameters + for kernels targeting Hopper. Similar to the DefaultGemmConfigurations used in CUTLASS 2, the CollectiveBuilder + takes in a small set of template parameters (e.g., the data types of operands A and B). It then automatically + determines the data loading strategy to use depending on whether the Hopper TMA feature can be used with the provided + parameters. If one does not indicate a particular scheduling policy or stage count to use (by using `Auto` template + parameters), the CollectiveBuilder will also automatically select these. + + Unlike DefaultGemmConfigurations a partial specialization of the CollectiveBuilder is not needed for many + configurations of operand types. Instead the CollectiveBuilder "builds" a configuration based on generic + properties of the specified operands, layouts, and other parameters. For example, when the stage count + is set to `Auto`, the CollectiveBuilder may automatically calculate the maximum number of stages that + will fit in shared memory given the types of operands and the thread block shape, rather than simply using + a single default value. + + CUTLASS 3.x provides builders for both collective mainloops and epilogues. The particular implementation of + the collective is specified via the schedule tags that corresond to the underlying collective's + dispatch policy. `gemm::collective::KernelScheduleAuto` and `epilogue::collective::EpilogueScheduleAuto` + are special cases of these schedules that allow the builder to also decide the dispatch policy for you, + therefore letting the builder pick the collective specialization. 
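+
+    As a sketch of what this looks like in practice (the concrete builder invocations used by this
+    example appear inside the ExampleRunner further below), a mainloop collective with fully
+    automatic policy selection can be declared as:
+
+      using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
+          cutlass::half_t, cutlass::layout::RowMajor,    8,    // A: element type, layout, alignment
+          cutlass::half_t, cutlass::layout::ColumnMajor, 8,    // B: element type, layout, alignment
+          float,                                               // accumulator type
+          cute::Shape<cute::_128,cute::_128,cute::_64>,        // tile shape
+          cute::Shape<cute::_2,cute::_1,cute::_1>,             // cluster shape
+          cutlass::gemm::collective::StageCountAuto,           // let the builder pick the stage count
+          cutlass::gemm::collective::KernelScheduleAuto        // let the builder pick the schedule
+        >::CollectiveOp;
+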
+ + CUTLASS builders make an attempt to pick the best schedule when `Auto` is provided such that the + assembled collectives have the best performance, but this is not a guarantee. A user relying on `Auto` + may get a free performance upgrade with newer CUTLASS releases in case we can provide more optimized + implementations that the builder can transparently assemble for `Auto`. But a user should not rely on + `Auto` if they require a specific scheduling policy and/or stage count to be used. + + If a user decides to let the builders pick the collective specialization via `Auto` schedules, + they must be used for both mainloop and epilogue alike to ensure compatibility between the + chosen collectives. Additionally, if a user chooses to opt in to a specific schedule, non-`Auto` + schedules must be used for both mainloop and epilogue builder schedules, and these schedules + must be compatible. + + One does not need to use the CollectiveBuilder to declare CUTLASS 3 kernels; one can still provide + every template parameter to the `gemm::collective::CollectiveMma`. Specifying every template parameter + in this manner remains the primary API for using CUTLASS 3 kernels. `CollectiveBuilder`s are + simply meant to be a convenience interface. + + Details of this example + ----------------------- + This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified. + This example also illustrates how CUTLASS 3 GEMMs targeting Hopper automatically support batched GEMMs by simply + extending the problem size with an additional tensor rank. + + CUTLASS 3.2 provides initial support for epilogue visitor trees (EVT) for the TMA warp-specialized collective. + EVTs allow users to define their own customized epilogue fusion patterns without having to write a new + collective epilogue. This is done by representing the fusion as a compute graph, where each node is one of a + fundamental set of load, store, or compute operations. These operations are either elementwise for tensor + inputs/outputs, broadcasts for vector/scalar inputs, or reductions for vector/scalar outputs. + This example shows how users can define their own custom EVT and use it with the CollectiveBuilder. 
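+
+    As a concrete illustration, the familiar linear combination D = alpha * acc + beta * C can be
+    expressed as a small tree of these fusion nodes. The sketch below mirrors the node structure of
+    the CustomEVT type defined in the ExampleRunner further down in this file; refer to that
+    definition for the exact element types and rounding style used:
+
+      using CustomEVT =  // D = beta * C + (alpha * acc)
+        cutlass::epilogue::fusion::Sm90EVT<
+          cutlass::epilogue::fusion::Sm90Compute<
+            cutlass::homogeneous_multiply_add, ElementD, ElementCompute, RoundStyle>,   // multiply-add
+          cutlass::epilogue::fusion::Sm90ScalarBroadcast<ElementScalar>,                // beta
+          cutlass::epilogue::fusion::Sm90SrcFetch<ElementC>,                            // C
+          cutlass::epilogue::fusion::Sm90EVT<
+            cutlass::epilogue::fusion::Sm90Compute<
+              cutlass::multiplies, ElementCompute, ElementCompute, RoundStyle>,         // multiply
+            cutlass::epilogue::fusion::Sm90ScalarBroadcast<ElementScalar>,              // alpha
+            cutlass::epilogue::fusion::Sm90AccFetch                                     // acc
+          >
+        >;
+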
+ + Example usage: + $ ./examples/49_hopper_with_collective_builder/49_collective_builder \ + --m=2048 --n=2048 --k=2048 --l=2 +*/ + +#include + +#include "cute/tensor.hpp" + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/device/tensor_fill.h" + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l; + float alpha, beta; + + Options(): + help(false), + error(false), + m(2048), n(2048), k(2048), l(1), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 2048); + cmd.get_cmd_line_argument("n", n, 2048); + cmd.get_cmd_line_argument("k", k, 2048); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "49_hopper_with_collective_builder\n\n" + << " This example showcases the use of CUTLASS's collective operation builders to easily construct\n" + << " performant kernels targeting NVIDIA's Hopper architecture.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::DeviceAllocation& block, + uint64_t seed=2023) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, scope_max, scope_min, 0); + + return true; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +// Wrapper to construct, run, and verify a GEMM. 
This example showcases CUTLASS's collective +// operation builders by specializing the GEMM only on the kernel schedule it will use and the +// number of pipeline stages. +// +// One can use a special `Auto` type that tells the CollectiveBuilder +// to select an appropriate value on its own. The CollectiveBuilder will attempt to select +// configurations that will result in the most-performant kernel, but this is not a guarantee. +// +// If relying on 'Auto' schedules, all builders must use the 'Auto' schedule to ensure compatiblity. +// For example, if `KernelScheduleAuto` is used for the mainloop builder, `EpilogueScheduleAuto` must +// be used for the epilogue builder. +// +// Furthermore, if an override schedule is selected, both epilogue and mainloop schedules must +// be specifically opt into a compatible selection. +// +// Behavior of the CollectiveBuilder with `Auto` types is subject to change in future releases +// -- do not rely on `Auto` if you require a specific scheduling policy. +template < + // Type of kernel schedule to generate + class MainloopScheduleType = cutlass::gemm::collective::KernelScheduleAuto, + // Type of epilogue schedule to generate + class EpilogueScheduleType = cutlass::epilogue::collective::EpilogueScheduleAuto, + // Number of pipeline stages to use + class StageCountType = cutlass::gemm::collective::StageCountAuto, + // Type of tile scheduler to use + class TileSchedulerType = cutlass::gemm::PersistentScheduler, + // Do we use custom epilogue visitor tree (EVT) fusion + bool UseCustomEVT = false +> +struct ExampleRunner { + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementD = cutlass::half_t; + using ElementAccumulator = float; + using ElementCompute = float; + using ElementScalar = float; + + // 16B alignment lets us use TMA + static constexpr int AlignmentA = 16 / sizeof(ElementA); + static constexpr int AlignmentB = 16 / sizeof(ElementB); + static constexpr int AlignmentC = 16 / sizeof(ElementC); + static constexpr int AlignmentD = 16 / sizeof(ElementD); + + static_assert(not UseCustomEVT || + (cute::is_same_v || + cute::is_same_v), + "Epilogue visitor trees are currently only supported by the TMA warp-specialized epilogue"); + static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; + + // EVTs can be constructed by composing the fundamental load/store/compute visitor operations defined in include/cutlass/epilogue/fusion + // For more complex examples of EVT construction please refer to include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp + using CustomEVT = // alpha * acc + beta * C + cutlass::epilogue::fusion::Sm90EVT, // beta * C + (alpha * acc) + cutlass::epilogue::fusion::Sm90ScalarBroadcast, // beta + cutlass::epilogue::fusion::Sm90SrcFetch, // C + cutlass::epilogue::fusion::Sm90EVT, // alpha * acc + cutlass::epilogue::fusion::Sm90ScalarBroadcast, // alpha + cutlass::epilogue::fusion::Sm90AccFetch // acc + > + >; + + // A predefined set of fusion operations (implemented with EVT) are supported by the TMA warp-specialized epilogue. + // Users can select one of these operations by passing one of the tags defined in include/cutlass/epilogue/fusion/operations.hpp + // to the CollectiveBuilder. 
This frees the user from having to compute additional parameters such as stage counts and copy atoms/layouts. + // These tags also provide additional metadata that can be queried at compile time. + using DefaultOperation = cutlass::epilogue::fusion::LinearCombination; + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + Shape<_128,_128,_64>, Shape<_1,_1,_1>, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementCompute, + ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, + EpilogueScheduleType, + cute::conditional_t + >::CollectiveOp; + + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementA, LayoutA, AlignmentA, + ElementB, LayoutB, AlignmentB, + ElementAccumulator, + Shape<_128,_128,_64>, Shape<_2,_1,_1>, + cute::conditional_t, + cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, + StageCountType>, + MainloopScheduleType + >::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue, + TileSchedulerType + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutTagA = cutlass::gemm::detail::StrideToLayoutTagA_t; + using LayoutTagB = cutlass::gemm::detail::StrideToLayoutTagB_t; + using LayoutTagC = cutlass::gemm::detail::StrideToLayoutTagC_t; + using LayoutTagD = cutlass::gemm::detail::StrideToLayoutTagC_t; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, float alpha, float beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), Gemm::LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), Gemm::LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), Gemm::LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), Gemm::LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + ElementScalar(alpha), + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + ElementScalar(beta), + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + cudaError_t result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "Reference kernel failed. 
Last CUDA error: " + << cudaGetErrorString(result) << std::endl; + return false; + } + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + bool run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{}, // epilogue.thread + block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + // Custom EVT fusions will have nested unnamed args, the structure of which + // can be deduced from the type definition of the EVT. + // Each node's arguments has the recursive structure of + // {first_child_args, ..., last_child_args, op_args}, + // For more complex examples of EVT initialization please refer to + // include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp + if constexpr (UseCustomEVT) { + arguments.epilogue.thread = + { // ternary op : beta * C + (alpha * acc) + {{options.beta}}, // leaf op+args : beta + {}, // leaf op+args : C + { // binary op : alpha * acc + {{options.alpha}}, // leaf op+args : alpha + {}, // leaf op+args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }; // end ternary op + } + // Pre-defined fusions will have flat, named args for user-friendlyness + else { + arguments.epilogue.thread.alpha = options.alpha; + arguments.epilogue.thread.beta = options.beta; + } + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + cutlass::Status status = gemm_op.can_implement(arguments); + if (status != cutlass::Status::kSuccess) { + std::cerr << "This kernel is not supported. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return false; + } + + status = gemm_op.initialize(arguments, workspace.get()); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Failed to initialize the CUTLASS kernel. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return false; + } + + // Run the GEMM + status = gemm_op.run(); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Failed to launch the CUTLASS kernel. 
Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return false; + } + + cudaError_t result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "Error running the CUTLASS kernel. Last CUDA error is: " + << cudaGetErrorString(result) << std::endl; + return false; + } + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + if (!passed) { + std::cerr << "Reference check failed" << std::endl; + } + + return passed; + } + +}; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to print a description of the example run and its result +void print_result(const std::string& description, bool passed) { + std::cout << description << ": " << (passed ? "Passed" : "Failed") << std::endl; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (__CUDACC_VER_MAJOR__ < 12 || props.major < 9) { + std::cout + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater) and CUDA 12.0 or greater.\n"; + return 0; + } + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of SMs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // This first example constructs a GEMM using the default schedule and stage count provided by + // the CollectiveBuilder. The scheduling policy that is expected to be most performant will be + // selected and the maximum number of stages that can fit in shared memory will be selected. + // + // This example is equivalent to declaring + // ExampleRunner< + // cutlass::gemm::collective::KernelScheduleAuto, + // cutlass::epilogue::collective::EpilogueScheduleAuto, + // cutlass::gemm::collective::StageCountAuto> + // Each of the `Auto` types indicate that the CollectiveBuilder should determine the scheduling policy and + // stage count. Note that the behavior of the CollectiveBuilder with `Auto` parameters is subject to change + // -- do not rely on `Auto` if you require a specific scheduling policy. + // If you opt in to a non-'Auto' schedule, make sure all collectives are built using specific, compatible schedules. 
+ ExampleRunner<> auto_schedule_auto_stage_runner; + passed = auto_schedule_auto_stage_runner.run(options, hw_info); + print_result("Automatically-selected schedule and stage count", passed); + + // One can override the stage count used in the GEMM by replacing cutlass::gemm::collective::StageCountAuto + // with the number of stages to use (5 in this case). + ExampleRunner< + cutlass::gemm::collective::KernelScheduleAuto, + cutlass::epilogue::collective::EpilogueScheduleAuto, + _5> auto_schedule_5_stage_runner; + + passed = auto_schedule_5_stage_runner.run(options, hw_info); + print_result("Automatically-selected schedule with 5 stages", passed); + + // One can also override the scheduling policy to use. In this case, use the KernelTma scheduling + // policy, which specifies that the Hopper TMA feature should be used, and we also use an epilogue + // that does not use any shared memory. + ExampleRunner tma_schedule_auto_stage_runner; + passed = tma_schedule_auto_stage_runner.run(options, hw_info); + print_result("TMA schedule with automatically-selected stage count", passed); + + // Here, we override the scheduling policy to use Hopper's TMA feature alongside the warp-specialized + // scheduling policy, and an epilogue that does not use any shared memory. + ExampleRunner ws_schedule_auto_stage_runner; + passed = ws_schedule_auto_stage_runner.run(options, hw_info); + print_result("Warp-specialized TMA schedule with automatically-selected stage count", passed); + + // Here, we override the scheduling policy to use Hopper's TMA feature, alongside the warp-specialized + // scheduling policy, TMA-based epilogue, leveraging persistent thread blocks. + ExampleRunner< + cutlass::gemm::KernelTmaWarpSpecializedPingpong, + cutlass::epilogue::TmaWarpSpecialized> ws_pingpong_schedule_auto_stage_runner; + passed = ws_pingpong_schedule_auto_stage_runner.run(options, hw_info); + print_result("Ping-pong warp-specialized TMA schedule with automatically-selected stage count", passed); + + // Here, we override the scheduling policy to use stream-K problem decomposition atop the cooperative + // warp-specialized scheduling policy. This kernel continues to leverage persistent thread blocks + // as well aso TMA in both the mainloop and epilogue. 
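+  // Stream-K additionally splits the reduction (K) dimension of output tiles across SMs, which
+  // helps balance work when the number of output tiles does not divide evenly among the
+  // available SMs.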
+ ExampleRunner< + cutlass::gemm::KernelTmaWarpSpecializedCooperative, + cutlass::epilogue::TmaWarpSpecializedCooperative, + cutlass::gemm::collective::StageCountAuto, + cutlass::gemm::StreamKScheduler> ws_cooperative_stream_k_schedule_auto_stage_runner; + passed = ws_cooperative_stream_k_schedule_auto_stage_runner.run(options, hw_info); + print_result("Cooperative warp-specialized TMA schedule using stream-K with automatically-selected stage count", passed); + + // Here, we override the fusion operation to use a customized EVT fusion, in addition to the previous schedule overrides + ExampleRunner< + cutlass::gemm::KernelTmaWarpSpecializedCooperative, + cutlass::epilogue::TmaWarpSpecializedCooperative, + cutlass::gemm::collective::StageCountAuto, + cutlass::gemm::PersistentScheduler, + true> ws_cooperative_schedule_auto_stage_custom_evt_runner; + passed = ws_cooperative_schedule_auto_stage_custom_evt_runner.run(options, hw_info); + print_result("Cooperative warp-specialized TMA schedule using custom epilogue visitor tree with automatically-selected stage count", passed); + +#endif + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/49_hopper_gemm_with_collective_builder/CMakeLists.txt b/examples/49_hopper_gemm_with_collective_builder/CMakeLists.txt new file mode 100644 index 0000000000..4925105d75 --- /dev/null +++ b/examples/49_hopper_gemm_with_collective_builder/CMakeLists.txt @@ -0,0 +1,34 @@ + +# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Both filenames are shorter to avoid MAX_PATH issues on Windows. 
+cutlass_example_add_executable( + 49_collective_builder + 49_collective_builder.cu + ) diff --git a/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu b/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu new file mode 100644 index 0000000000..a736e5ce31 --- /dev/null +++ b/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu @@ -0,0 +1,526 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Hopper GEMM example to create a GEMM kernel with custom Collectives + + The following example shows how to assemble a custom GEMM kernel that spells out the Collectives + directly instead of using a builder and, in the process, instance a more efficient Epilogue + (from `cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp`) instead of using the default epilogue. + + The GemmUniversal API takes 3 main template arguments: + (1) the problem shape / extents + (2) the collective mainloop type + (3) the collective epilogue type + + While the collecive mainloop can be stamped out using a CollectiveBuilder interface, it is + possible to build a custom collective mainloop directly as well. Furthermore, since epilogues + do not yet have a builder interface, this example shows how to instantiate a more-efficient + epilogue alongside the collective mainloop. + + Note: there are several ways to implement the GEMM epilogue in Hopper - each with its own set + of trade-offs. So it is recommended that users look at the options available under + cutlass/epilogue/collective and evaluate for their particular scenario. 
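+
+    For orientation, the final assembly at the bottom of this example has the following overall
+    shape (the mainloop and epilogue types referenced here are built up step by step in main()):
+
+      using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+          cute::Shape<int,int,int,int>,   // (1) problem shape: M, N, K, L (batch)
+          CollectiveMainloop,             // (2) hand-assembled CollectiveMma
+          Epilogue>;                      // (3) custom shared-memory-swizzled epilogue
+      using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+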
+ + Please refer to examples 48, 49 to learn more about kernel schedules and other CuTe examples + present in `test/unit/cute` to famialiarize with the basics of CuTe. + + Examples: + + $ ./examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/util/command_line.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/collective_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/device/tensor_fill.h" + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l; + int alpha, beta; + + Options(): + help(false), + error(false), + m(2048), n(2048), k(2048), l(1), + alpha(1), beta(0) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 2048); + cmd.get_cmd_line_argument("n", n, 2048); + cmd.get_cmd_line_argument("k", k, 2048); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1); + cmd.get_cmd_line_argument("beta", beta, 0); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "50_hopper_gemm_with_epilogue_swizzle\n\n" + << "Hopper GEMM Example with Epilogue Swizzle.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::DeviceAllocation& block, + uint64_t seed=2023) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, scope_max, scope_min, 0); + + return true; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +// Wrapper to run and verify a GEMM. 
+template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, int32_t alpha, int32_t beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + ElementCompute(alpha), + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + ElementCompute(beta), + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + cudaError_t result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "Reference kernel failed. 
Last CUDA error: " + << cudaGetErrorString(result) << std::endl; + return false; + } + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + bool run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + cutlass::Status status = gemm_op.can_implement(arguments); + if (status != cutlass::Status::kSuccess) { + std::cerr << "This kernel is not supported. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return false; + } + + status = gemm_op.initialize(arguments, workspace.get()); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Failed to initialize the CUTLASS kernel. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return false; + } + + // Run the GEMM + status = gemm_op.run(); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Failed to launch the CUTLASS kernel. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return false; + } + + cudaError_t result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "Error running the CUTLASS kernel. 
Last CUDA error is: " + << cudaGetErrorString(result) << std::endl; + return false; + } + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + if (!passed) { + std::cerr << "Reference check failed" << std::endl; + } + + return passed; + } + +}; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (__CUDACC_VER_MAJOR__ < 12 || props.major < 9) { + std::cout + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater) and CUDA 12.0 or greater.\n"; + return 0; + } + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of SMs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // Problem configuration + using ElementA = int8_t; + using ElementB = int8_t; + using ElementAcc = int32_t; + using ElementOutput = int8_t; + + // Note : Only TN WGMMA Gemm is supported currently in 3.0 + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + + // Tiling configuration selection + using TileShape = Shape<_128,_64,_128>; + + // Choosing a thread block cluster larger than 1 allows us to Multicast data across thread blocks + using ClusterShape = Shape<_1,_2,_1>; + + // + // Assembling the CollectiveMainloop type + // + + // Pipeline Depth to be used i.e number of A, B buffers in shared memory + constexpr int PipelineStages = 8; + + // Let's choose a Warp-Specialized Mainloop implemention which uses TMA + // Note : This requires / assumes the tensors to be 16B aligned + using DispatchPolicy = cutlass::gemm::MainloopSm90TmaGmmaWarpSpecialized; + + // TN => K Major for both A & B + static constexpr cute::GMMA::Major GmmaMajorA = cute::GMMA::Major::K; + static constexpr cute::GMMA::Major GmmaMajorB = cute::GMMA::Major::K; + + // We use the SS op selector as both A, B operands are read directly from SMEM (for TN WGMMA) + using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< + ElementA, ElementB, ElementAcc, TileShape, GmmaMajorA, GmmaMajorB>())); + + // A loads can be optimized with multicast if cluster-n > 1 + using GmemTiledCopyA = std::conditional< cute::size(shape<1>(ClusterShape{})) == 1, + cute::SM90_TMA_LOAD, + cute::SM90_TMA_LOAD_MULTICAST>::type; + + // B loads can be optimized with multicast if cluster-m > 1 + using GmemTiledCopyB = std::conditional< 
cute::size(shape<0>(ClusterShape{})) == 1, + cute::SM90_TMA_LOAD, + cute::SM90_TMA_LOAD_MULTICAST>::type; + + using SmemLayoutAtomA = decltype(cutlass::gemm::collective::detail::ss_smem_selector< + GmmaMajorA, ElementA, decltype(cute::get<0>(TileShape{})), decltype(cute::get<2>(TileShape{})) + >()); + + using SmemLayoutAtomB = decltype(cutlass::gemm::collective::detail::ss_smem_selector< + GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape{})), decltype(cute::get<2>(TileShape{})) + >()); + + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + DispatchPolicy, + TileShape, + ElementA, + cutlass::gemm::TagToStrideA_t, + ElementB, + cutlass::gemm::TagToStrideB_t, + TiledMma, + GmemTiledCopyA, + SmemLayoutAtomA, + void, // Does not need a SmemCopyAtom, since A is read directly from SMEM + cute::identity, + GmemTiledCopyB, + SmemLayoutAtomB, + void, // Does not need a SmemCopyAtom, since B is read directly from SMEM + cute::identity + >; + + // + // Assembling the Collective Epilogue Type + // + + // Break the 128 along TILE_M into chunks of 32, to get a 128B leading dimension + using PreSwizzleLayout = Layout< Shape< Shape <_32,_4 >,_64>, + Stride,_32>>; + + // 128 threads loading 16 elements each (to get vectorized global stores) + using TileShapeS2R = Shape<_128,_16>; + + // Layout to ensure bank-conflict free loads & stores + using SmemLayout = ComposedLayout< + Swizzle<3,4,3>, + smem_ptr_flag_bits::value>, + PreSwizzleLayout>; + + // Tiled copy from Smem to Registers + // Note : CuTe will vectorize this copy if the tiling + swizzling above were right + using TiledCopyS2R = TiledCopy< + Copy_Atom, + Layout< Shape<_128,_16>, + Stride<_16,_1>>, + TileShapeS2R>; + + using Epilogue = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter< + cutlass::epilogue::collective::Epilogue< + cutlass::gemm::TagToStrideC_t, + cutlass::gemm::TagToStrideC_t, + cutlass::epilogue::thread::LinearCombination, + SmemLayout, + Copy_Atom, + TiledCopyS2R, + Copy_Atom>>; + + // + // Assembling the GemmKernel + // + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + Epilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + passed = runner.run(options, hw_info); + + std::cout << "WGMMA GEMM with Epilogue Swizzle : " << (passed ? "Passed" : "Failed") << std::endl; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/50_hopper_gemm_with_epilogue_swizzle/CMakeLists.txt b/examples/50_hopper_gemm_with_epilogue_swizzle/CMakeLists.txt new file mode 100644 index 0000000000..5498d4effe --- /dev/null +++ b/examples/50_hopper_gemm_with_epilogue_swizzle/CMakeLists.txt @@ -0,0 +1,35 @@ + +# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. 
Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +cutlass_example_add_executable( + 50_hopper_gemm_with_epilogue_swizzle + 50_hopper_gemm_with_epilogue_swizzle.cu + ) diff --git a/examples/51_hopper_gett/51_hopper_gett.cu b/examples/51_hopper_gett/51_hopper_gett.cu new file mode 100644 index 0000000000..005eaec5a1 --- /dev/null +++ b/examples/51_hopper_gett/51_hopper_gett.cu @@ -0,0 +1,371 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Example of a GETT targeting Hopper tensor cores using the CUTLASS 3.x API. + + CUTLASS has long provided implementations of Generalized Matrix times Matrix (GEMM) kernels. + However, a plethora of workloads compute on higher ranked tensors. 
Products of such tensors, + called tensor contractions, can be executed as multiple batched GEMMs, however, they can be + further accelerated with kernels that natively operate on these higher ranked tensors to + perform Generalized Tensor times Tensor contractions (GETT). CuTe's hierarchical layouts + and CUTLASS 3.0's unified micro-kernels make implementation of GETTs trivial. In this example, + we show how CUTLASS 3.0, CuTe, and Hopper's TMA feature together can accelerate GETTs while + making the process of authoring custom GETT kernels easier than ever before. + + The modes of a tensor that participate in a GETT can be fundamentally grouped into four + semantic categories. The contraction modes (or K-modes) only appear in the A and B (left and right) + inputs but not in the C output tensor. Row modes (or M-modes) only appear in the left + input tensor (A) and the output tensor (C). Column modes (or N-modes) only appear in the + right (B) input tensor and the output tensor (C). Batch modes (or L-modes) appear in all + input and output tensors. If we fold the many modes of a tensor contraction into these four + categories, it would allow us to represent the input and output tensors as rank-3 "matrices" + that can be computed upon as if we were computing a batched GEMM! + + This is exactly what CuTe's hierarchical layout representation allows us to do! Instead of having + simple integers as strides for these four modes, we can have nested strides for each of these + semantic categories that themselves have multiple modes within them -- multi-mode strides! + In CUTLASS 3.0, all one has to do to take advantage of this capability is to substitute the + required multi-mode strides instead of the default ones provided by gemm::detail::TagToStrideX. + + In the following example, we illustrate how every Hopper GEMM in CUTLASS 3.0 is a GETT in disguise. + We begin by defining the four modes detailed above as Row, Col (column), Red (reduction), and + Bat (batch) strides, which we then nest for each of the in/out tensors to create our rank-3 stride + tuples. Note that although we do not define the problem shape type explicitely, it too remains a + rank-4 shape tuple just like any other batched GEMM, but instead with multi-mode shapes for each + of the four corresponding multi-modes within it. After this, the same CollectiveMma and + CollectiveBuilder we describe in examples 50 and 49 are used to create our kernel type. Nothing + else changes from a user's point of view. Note that multi-mode strides do not affect our + specializations in any way -- the lexical spelling of our kernels remains the same. The + only difference between a CUTLASS 3 batched GEMM and GETT are the instaced CuTe Layouts. + + CollectiveBuilders rely on detecting the static-1 in the stride tuples to determine the major mode, + which is what the example demonstrates. However, it is possible to have all modes be dynamic as well + if the user assembles a CollectiveMma manually and ensures that the runtime strides are compatible + with the static micro-kernel of the collective (TiledMma, TiledCopy, and smem layouts). On the other + hand, a user can have more than one static stride too (which need not correspond to the major mode). + + In particular, this example demonstrates a GETT where the 0th M-mode (M0) in A and the 0th K-mode (K0) + in B are major. All other combinations of major modes are supported, with the exception of mixed + K-major scenarios where both A and B are K-major (e.g. K0 is major in A but K1 is major in B). 
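    To make the folding concrete, consider the third example execution listed at the end of
    this comment: C's modes are (m,a,b,p,q,n,l), A's are (m,l,b,k,a), and B's are (k,n,p,q,l).
    The modes shared by A and C but absent from B -- m, a, b -- fold into the row multi-mode;
    n, p, and q fold into the column multi-mode; k is the contraction multi-mode; and l is the
    batch multi-mode. A matching multi-mode stride for A could then be sketched as follows
    (a sketch only; the exact, more general aliases this example actually uses are defined in
    main() below):

      using RowModeStridesA = cute::Stride<cute::Int<1>, int64_t, int64_t>;  // (m,a,b), m major
      using RedModeStridesA = cute::Stride<int64_t>;                         // (k)
      using BatModeStridesA = cute::Stride<int64_t>;                         // (l)
      using StrideA         = cute::Stride<RowModeStridesA, RedModeStridesA, BatModeStridesA>;

    From the kernel's perspective this is still just a rank-3 "matrix" stride (row, reduction,
    batch); only the nesting inside each entry changes.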
+ NVIDIA Hopper architecture's TMA feature makes the predictaion required to implement these complicated + kernels trivial, as it is all handled by TMA itself without requiring any programmer effort. + + Example executions, where the stride order defines the major-order (major on the left): + 51_hopper_gett --modeC=m,n,l --modeA=m,k,l --modeB=k,n,l --extents=m:4096,n:4096,k:4096 + 51_hopper_gett --modeC=l,m,n --modeA=m,l,k --modeB=k,n,l --extents=m:128,n:128,k:128,l:64 + 51_hopper_gett --modeC=m,a,b,p,q,n,l --modeA=m,l,b,k,a --modeB=k,n,p,q,l --extents=m:32,a:32,b:3,n:128,k:128,l:4,p:3,q:3 +*/ + +#include "gett_kernel.cuh" +#include "thrust/host_vector.h" +#include "thrust/device_vector.h" + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/default_epilogue.hpp" + +#include "cutlass/util/gett_commandline.hpp" +#include "cutlass/util/reference/device/gett.hpp" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/print_error.hpp" + +namespace example { + +// Returns true if the left-most value in the tuple is statically known to be 1 +template +constexpr bool +is_left_major() { + // Account for stride types with and without batch mode and batch modes with static zero stride + return cute::is_constant<1, decltype(cute::size<0,0>(Stride{}))>::value; +} + +// Same as cute::make_int_tuple but inserts a major stride (Int<1>) for the leftmost mode if required +template +static constexpr +auto +make_stride_tuple(Indexable const& t, int n, int64_t init_default = 0) { + static_assert(Rank > 1); + if constexpr (IsMajor) { + return cute::transform(cute::make_seq{}, [&](auto i) { + if constexpr (i == 0) { + return cute::Int<1>{}; + } + else { + return i < n ? t[i] : init_default; + } + }); + } + else { + return cute::make_int_tuple(t, n, init_default); + } +} + +} // namespace example + +////////////////////////////////////////////////////////////////////////////// + +int +main(int argc, char const* argv[]) { +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + using namespace cute; + + if (argc != 5) { + std::cout << "Number of command line args must be 4.\n"; + cutlass::GettCommandLine::print_usage(); + return 0; + } + + // + // Define the stride types for A, B, C, and D + // + + // Stride for A (left input). If reduction mode is major, same must be major in B + // For this example, M0 is major in A. + using RowModeStridesA = cute::Stride, int64_t, int64_t, int64_t>; + using RedModeStridesA = cute::Stride; + using BatModeStridesA = cute::Stride; + + // Stride for B (right input). If reduction mode is major, same must be major in A + // For this example, K0 is major in B. + using ColModeStridesB = cute::Stride; + using RedModeStridesB = cute::Stride, int64_t, int64_t>; + using BatModeStridesB = cute::Stride; + + // Strides for output, which can all be dynamic. + using RowModeStridesC = cute::Stride; + using ColModeStridesC = cute::Stride; + using BatModeStridesC = cute::Stride; + + // Assmble our rank-3 multi-mode strides for the in/out tensors + using StrideA = cute::Stride; + using StrideB = cute::Stride; + using StrideC = cute::Stride; + + // Note: C and D share strides here for simplicity. + // In general, they need not have the same layout. 
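  // As an illustrative sketch (the names here are hypothetical and not used elsewhere), a
  // distinct layout for D would simply be another nested stride assembled from its own
  // mode-stride tuples, e.g.:
  //
  //   using RowModeStridesD = cute::Stride<int64_t, int64_t, int64_t, int64_t>;
  //   using StrideD_custom  = cute::Stride<RowModeStridesD, ColModeStridesC, BatModeStridesC>;
  //
  // This example keeps D identical to C: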
+ using StrideD = StrideC; + + // + // Define element types for tensors and intermediate values + // + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementD = float; + using ElementAccumulator = float; + using ElementEpilogue = float; + + // The following constexpr values set the max number of modes in each MNKL mode + constexpr int MaxRank_M = cute::rank(RowModeStridesA{}); // Max row modes + constexpr int MaxRank_N = cute::rank(ColModeStridesB{}); // Max column modes + constexpr int MaxRank_K = cute::rank(RedModeStridesA{}); // Max contraction modes + constexpr int MaxRank_L = cute::rank(BatModeStridesA{}); // Max batch modes + static_assert(cute::rank(RowModeStridesA{}) == cute::rank(RowModeStridesC{})); + static_assert(cute::rank(ColModeStridesB{}) == cute::rank(RowModeStridesC{})); + static_assert(cute::rank(RedModeStridesA{}) == cute::rank(RedModeStridesB{})); + static_assert(cute::rank(BatModeStridesA{}) == cute::rank(BatModeStridesC{})); + static_assert(cute::rank(BatModeStridesB{}) == cute::rank(BatModeStridesC{})); + + // Parse command line to get modes, extents, and strides + cutlass::GettCommandLine cmd; + auto parsed_args = cmd.parse(argc, argv, true); + + auto& m = parsed_args.M; + auto& ldAm = parsed_args.ldAm; + auto& ldCm = parsed_args.ldCm; + int rank_m = int(m.size()); + + auto& n = parsed_args.N; + auto& ldBn = parsed_args.ldBn; + auto& ldCn = parsed_args.ldCn; + int rank_n = int(n.size()); + + auto& k = parsed_args.K; + auto& ldAk = parsed_args.ldAk; + auto& ldBk = parsed_args.ldBk; + int rank_k = int(k.size()); + + auto& l = parsed_args.L; + auto& ldAl = parsed_args.ldAl; + auto& ldBl = parsed_args.ldBl; + auto& ldCl = parsed_args.ldCl; + int rank_l = int(l.size()); + + if ((rank_m > MaxRank_M) || (rank_n > MaxRank_N) || (rank_k > MaxRank_K) || (rank_l > MaxRank_L)) { + std::cerr << "ERROR: Input has more modes than statically configured."; + return 1; + } + + // Check that the user input major stride match the static major strides. 
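  // (example::is_left_major<Stride>() is the helper defined above: it is true only when the
  //  left-most entry of the given mode-stride tuple is statically Int<1>. For modes that were
  //  not declared major the condition below is therefore a compile-time false and the branch
  //  is effectively dead code; for the declared-major modes -- M0 in A and K0 in B in this
  //  example -- it verifies at runtime that the user-supplied leading stride is indeed 1.)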
+ if (example::is_left_major() && (ldAm[0] != 1)) { + std::cerr << "ERROR: A_M0 is expected to be major, but was not in the provided input!\n"; + return 1; + } + + if (example::is_left_major() && (ldAk[0] != 1)) { + std::cerr << "ERROR: A_K0 is expected to be major, but was not in the provided input!\n"; + return 1; + } + + if (example::is_left_major() && (ldBn[0] != 1)) { + std::cerr << "ERROR: B_N0 is expected to be major, but was not in the provided input!\n"; + return 1; + } + + if (example::is_left_major() && (ldBk[0] != 1)) { + std::cerr << "ERROR: B_K0 is expected to be major, but was not in the provided input!\n"; + return 1; + } + + // Convert to `cute::Tuple`s and set up arguments + auto M = make_int_tuple(m.data(), rank_m, 1); + auto dAm = example::make_stride_tuple()>(ldAm.data(), rank_m); + auto dCm = example::make_stride_tuple()>(ldCm.data(), rank_m); + + auto N = make_int_tuple(n.data(), rank_n, 1); + auto dBn = example::make_stride_tuple()>(ldBn.data(), rank_n); + auto dCn = example::make_stride_tuple()>(ldCn.data(), rank_n); + + auto K = make_int_tuple(k.data(), rank_k, 1); + auto dAk = example::make_stride_tuple()>(ldAk.data(), rank_k); + auto dBk = example::make_stride_tuple()>(ldBk.data(), rank_k); + + auto L = make_int_tuple(l.data(), rank_l, 1); + auto dAl = make_int_tuple(ldAl.data(), rank_l, 0); + auto dBl = make_int_tuple(ldBl.data(), rank_l, 0); + auto dCl = make_int_tuple(ldCl.data(), rank_l, 0); + + // Concat tuples to turn it into rank-4 problem shape and rank-3 strides, just like GEMM + auto problem_shape = make_shape(M, N, K, L); + StrideA stride_A = make_stride(dAm, dAk, dAl); + StrideB stride_B = make_stride(dBn, dBk, dBl); + StrideC stride_C = make_stride(dCm, dCn, dCl); + StrideD stride_D = stride_C; + + auto alpha = ElementEpilogue(1.0f); + auto beta = ElementEpilogue(1.0f); + + // + // Allocate and init tensors + // + auto M_size = std::accumulate(std::begin(m), std::end(m), 1, std::multiplies<>{}); + auto N_size = std::accumulate(std::begin(n), std::end(n), 1, std::multiplies<>{}); + auto K_size = std::accumulate(std::begin(k), std::end(k), 1, std::multiplies<>{}); + auto L_size = std::accumulate(std::begin(l), std::end(l), 1, std::multiplies<>{}); + + thrust::host_vector h_A(M_size * K_size * L_size); + thrust::host_vector h_B(N_size * K_size * L_size); + thrust::host_vector h_C(M_size * N_size * L_size); + thrust::host_vector h_D(M_size * N_size * L_size); + + // Note: the cast to int here is to avoid false-negative ref-checks which can + // occur due to floating point arithmetic not being purely associative. 
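  // (The expressions below yield small integer values in [-1, 3]. Products and partial sums
  //  of such integers remain exactly representable in the half/float types used here, so the
  //  CUTLASS kernel and the device reference should produce identical results regardless of
  //  accumulation order, allowing the exact BlockCompareEqual check further down.)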
+ for (auto& a : h_A) a = ElementA(int(4*(rand() / double(RAND_MAX)) - 1)); + for (auto& b : h_B) b = ElementB(int(4*(rand() / double(RAND_MAX)) - 1)); + for (auto& c : h_C) c = ElementC(int(4*(rand() / double(RAND_MAX)) - 1)); + for (auto& d : h_D) d = ElementD(-1); + + thrust::device_vector d_A = h_A; + thrust::device_vector d_B = h_B; + thrust::device_vector d_C = h_C; + thrust::device_vector cutlass_result = h_D; + thrust::device_vector reference_result = h_D; + + // + // Compute GETT + // + auto status = example::gett_kernel( + problem_shape, + d_A.data().get(), stride_A, + d_B.data().get(), stride_B, + ElementAccumulator{}, + d_C.data().get(), stride_C, + cutlass_result.data().get(), stride_D, + alpha, beta); + + if (cutlass::Status::kSuccess != status) { + std::cerr << "ERROR: GETT operator launch failed.\n"; + return 1; + } + + auto cuda_err = cudaDeviceSynchronize(); + if (cudaSuccess != cuda_err) { + std::cerr << "ERROR: GETT operator execution failed. with error :"; + std::cerr << cudaGetErrorString(cuda_err) << "\n"; + return 1; + } + + // + // Verify + // + + cutlass::reference::device::gett( + problem_shape, + d_A.data().get(), stride_A, + d_B.data().get(), stride_B, + ElementAccumulator{}, + d_C.data().get(), stride_C, + reference_result.data().get(), stride_D, + alpha, beta); + + cuda_err = cudaDeviceSynchronize(); + if (cudaSuccess != cuda_err) { + std::cerr << "ERROR: GETT reference execution failed. with error :"; + std::cerr << cudaGetErrorString(cuda_err) << "\n"; + return 1; + } + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual( + reference_result.data().get(), cutlass_result.data().get(), cutlass_result.size()); + if (passed) { + std::cout << "GETT verification passed.\n"; + return 0; + } + else { + std::cerr << "ERROR: GETT verification failed! Printing detailed stats.\n"; + h_D = reference_result; + thrust::host_vector h_cutlass_result = cutlass_result; + print_relative_error(h_cutlass_result.size(), h_cutlass_result.data(), h_D.data()); + + std::cout << "StrideA: "; print(stride_A); std::cout << '\n'; + std::cout << "StrideB: "; print(stride_B); std::cout << '\n'; + std::cout << "StrideC: "; print(stride_C); std::cout << '\n'; + std::cout << "StrideD: "; print(stride_D); std::cout << '\n'; + return 1; + } +#else + std::cerr << "Unsupported example. Please ensure CUTLASS_ARCH_MMA_SM90_SUPPORTED is defined.\n"; + return 0; +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) +} diff --git a/examples/51_hopper_gett/CMakeLists.txt b/examples/51_hopper_gett/CMakeLists.txt new file mode 100644 index 0000000000..f18dff3817 --- /dev/null +++ b/examples/51_hopper_gett/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. 
Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + 51_hopper_gett + 51_hopper_gett.cu +) diff --git a/examples/51_hopper_gett/gett_kernel.cuh b/examples/51_hopper_gett/gett_kernel.cuh new file mode 100644 index 0000000000..6a775d137b --- /dev/null +++ b/examples/51_hopper_gett/gett_kernel.cuh @@ -0,0 +1,138 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cute/tensor.hpp" + +#include "cutlass/arch/arch.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "cutlass/epilogue/collective/collective_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" + +namespace example { + +// +// GETT entry point +// +template < + class ProblemShapeMNKL, + class ElementA, + class StrideA, + class ElementB, + class StrideB, + class ElementAccumulator, + class ElementC, + class StrideC, + class ElementD, + class StrideD, + class ElementEpilogue> +cutlass::Status +gett_kernel( + ProblemShapeMNKL problem_shape_mnkl, + ElementA const* ptr_A, StrideA stride_a_mkl, + ElementB const* ptr_B, StrideB stride_b_nkl, + ElementAccumulator _, + ElementC const* ptr_C, StrideC stride_c_mnl, + ElementD * ptr_D, StrideD stride_d_mnl, + ElementEpilogue alpha, ElementEpilogue beta, + cudaStream_t stream = 0) { + using namespace cute; + + // TileShape -- GETT configuration + // Specify the number of elements to take from each mode + // BLK_M = (M0,M1,...) BLK_N = (M0,M1,...) BLK_K = (K0,K1,...) + + // Take 128 from m0, 128 from n0, 64 from k0 + using TileShape = Shape, Shape<_128>, Shape<_64>>; + + /* Other examples: + * Take 32 elements from m0 and 4 elements from m1 + * Take 64 elements from n0 and 2 elements from n1 + * Take 8 elements from k0 and 8 elements from k1 + **/ + // using TileShape = Shape, Shape<_64,_2>, Shape<_8,_8>>; + + using EpilogueThreadOp = cutlass::epilogue::thread::LinearCombination< + ElementD, 1, ElementAccumulator, ElementEpilogue, cutlass::epilogue::thread::ScaleType::Default, + cutlass::FloatRoundStyle::round_to_nearest, ElementC>; + + // No changes are required to the default epilogue + using CollectiveEpilogue = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter< + cutlass::epilogue::collective::DefaultEpilogue< + StrideC, + StrideD, + EpilogueThreadOp, + cutlass::gemm::EpilogueDefault>>; + + // CollectiveMma for GETTs can be built using the CollectiveBuilders + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementA, StrideA, 128 / cutlass::sizeof_bits::value, + ElementB, StrideB, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + TileShape, Shape<_1,_2,_1>, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, + cutlass::gemm::collective::KernelScheduleAuto + >::CollectiveOp; + + // The GETT kernel is a composition of a collective mainloop and epilogue, just like any 3.x GEMM + using GettKernel = cutlass::gemm::kernel::GemmUniversal< + ProblemShapeMNKL, + CollectiveMainloop, + CollectiveEpilogue>; + + using GettOperator = cutlass::gemm::device::GemmUniversalAdapter; + + typename GettOperator::Arguments args { + cutlass::gemm::GemmUniversalMode::kBatched, + problem_shape_mnkl, + { ptr_A, stride_a_mkl, ptr_B, stride_b_nkl }, + { {alpha, beta}, ptr_C, stride_c_mnl, ptr_D, stride_d_mnl } + }; + +#if CUTLASS_DEBUG_TRACE_LEVEL > 0 + print("Problem shape:"); + print("\tM: "); print(cute::get<0>(problem_shape_mnkl)); print("\n"); + print("\tN: "); print(cute::get<1>(problem_shape_mnkl)); print("\n"); + print("\tK: "); print(cute::get<2>(problem_shape_mnkl)); print("\n"); + print("\tL: "); 
print(cute::get<3>(problem_shape_mnkl)); print("\n"); + print("TileSape:"); print(TileShape{}); print("\n"); +#endif + + GettOperator op; + return op(args, stream); +} + +} // namespace example diff --git a/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu b/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu new file mode 100644 index 0000000000..0a74e02a83 --- /dev/null +++ b/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu @@ -0,0 +1,693 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Example of a Hopper gather+GEMM+scatter kernel fusion. + + This example fuses gather before GEMM and scatter after GEMM into the same + GEMM kernel. Gather and scatter operation is controled by an index vector + to select rows or columns from A, B, C or D matrices. + + Gather/scatter operations are always performed along a strided dimension + in order to preserve vectorized loads/stores. Thus the index vector is + applied to rows of row-major matrices and columns of column-major matrices. + + Note that the index vector must contain integers in range [0,X) where + X is one of (M,N,K), depending on selected gather dimension. The problem + shape given to the GEMM kernel must consist of matrix sizes AFTER gather + and BEFORE scatter operations are applied. 
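    As a functional (not performance-representative) sketch of these semantics, gathering the
    rows of a row-major A with an index vector idx of length M_gathered behaves like the host
    loop below, where the names are illustrative only:

      for (int i = 0; i < M_gathered; ++i)
        for (int k = 0; k < K; ++k)
          A_gathered[i * K + k] = A[idx[i] * K + k];

    and scattering D applies the same indirection on the output side, i.e. row i of the
    computed tile is written to row idx[i] of D. The fused kernel performs these indirections
    directly in its global-memory loads and stores instead of materializing intermediate
    gathered tensors, which is what the unfused reference path in this example does.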
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" +#include "gather_gemm.hpp" +#include "gather_kernel.cuh" +#include "scatter_epilogue.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +using namespace cute; + +namespace example { + +// Command line options parsing +struct Options { + + bool help = false; + + cutlass::gemm::BatchedGemmCoord problem_size = {2048, 2048, 2048, 1}; + int index_size = 1024; + int mode = 1; // N-mode gather/scatter by default + + float alpha = 1.0f; + float beta = 0.0f; + + bool reference_check = true; + int iterations = 20; + + bool valid() const { + return problem_size.m() > 0 + && problem_size.n() > 0 + && problem_size.k() > 0 + && problem_size.batch() > 0 + && 0 <= mode && mode < 3 + && index_size <= problem_size.at(mode) + && iterations > 0; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch_size", problem_size.batch()); + cmd.get_cmd_line_argument("index_size", index_size); + + char const modes[] = {'m', 'n', 'k'}; + char mode_input = modes[mode]; + cmd.get_cmd_line_argument("mode", mode_input); + mode = int(std::distance(std::begin(modes), std::find(std::begin(modes), std::end(modes), mode_input))); + + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + + cmd.get_cmd_line_argument("check", reference_check, true); + cmd.get_cmd_line_argument("iterations", iterations); + + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << + "52_hopper_gather_scatter_fusion example\n" + "\n" + " This example uses the CUTLASS Library to fuse gather/scatter of input/output tensors with GEMM.\n" + " It validates and benchmarks the fused kernel against an unfused implementation that executes\n" + " gather+GEMM+scatter in sequence and writes intermediate (gathered) tensors to memory.\n" + " For the unfused implementation two GEMM kernels are considered: default one that uses the same\n" + " schedule and instruction set as the fused one, and an optimized one that utilizes advanced\n" + " features (such as TMA units) that cannot be used by the fused kernel due to hardware constraints." 
+ "\n" + "Options:\n" + " --help If specified, displays this usage statement.\n" + " --m= GEMM M dimension\n" + " --n= GEMM N dimension\n" + " --k= GEMM K dimension\n" + " --batch_size= GEMM batch size\n" + " --index_size= Size of N dimension gather/scatter index\n" + " --mode= Gather mode (M, N, or K)\n" + " --alpha= GEMM alpha parameter\n" + " --beta= GEMM beta parameter\n" + " --iterations= Number of profiling iterations to perform.\n" + "\n" + "Examples:\n" + "\n" + "$ ./examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion --m=1024 --n=2048 --k=1024 --mode=n --index_size=1024\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct ExampleRunner +{ + // Useful aliases + + using ProblemShape = Shape; + + using StrideA = cutlass::gemm::TagToStrideA_t; + using StrideB = cutlass::gemm::TagToStrideB_t; + using StrideC = cutlass::gemm::TagToStrideC_t; + using StrideD = cutlass::gemm::TagToStrideC_t; + + // Alias to for the epilogue type that supports gather/scatter + using Epilogue = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter< + cutlass::epilogue::collective::EpilogueGatherScatter< + StrideC, StrideD, + cutlass::epilogue::thread::LinearCombination< + ElementD, 1, + ElementAccumulator, ElementComputeEpilogue, + cutlass::epilogue::thread::ScaleType::Default, + cutlass::FloatRoundStyle::round_to_nearest, ElementC + >, + cutlass::gemm::EpilogueDefault, + GatherC, + ScatterD + > + >; + + // Alias to for the mainloop type + using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementA, LayoutA, 128 / cutlass::sizeof_bits::value, + ElementB, LayoutB, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + Shape<_128,_128,_64>, + Shape<_1,_1,_1>, + cutlass::gemm::collective::StageCountAuto, + cutlass::gemm::KernelCpAsyncWarpSpecialized + >::CollectiveOp; + + using Kernel = cutlass::gemm::kernel::GemmGather< + ProblemShape, + Mainloop, + Epilogue, + void, + GatherA, + GatherB + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + static constexpr bool DoGatherA = not cutlass::platform::is_same::value; + static constexpr bool DoGatherB = not cutlass::platform::is_same::value; + static constexpr bool DoGatherC = not cutlass::platform::is_same::value; + static constexpr bool DoScatterD = not cutlass::platform::is_same::value; + + static constexpr bool GatherAonM = DoGatherA && cutlass::platform::is_same::value; + static constexpr bool GatherAonK = DoGatherA && cutlass::platform::is_same::value; + static constexpr bool GatherBonN = DoGatherB && cutlass::platform::is_same::value; + static constexpr bool GatherBonK = DoGatherB && cutlass::platform::is_same::value; + static constexpr bool GatherConM = DoGatherC && cutlass::platform::is_same::value; + static constexpr bool GatherConN = DoGatherC && cutlass::platform::is_same::value; + static constexpr bool ScatterDonM = DoScatterD && cutlass::platform::is_same::value; + static constexpr bool ScatterDonN = DoScatterD && cutlass::platform::is_same::value; + + static constexpr bool GatherModeM = GatherAonM || GatherConM || ScatterDonM; + static constexpr bool GatherModeN = GatherBonN || GatherConN || ScatterDonN; + static constexpr bool GatherModeK = GatherAonK || GatherBonK; + + static_assert( GatherModeM && !GatherModeN && !GatherModeK || + !GatherModeM && GatherModeN && !GatherModeK || + !GatherModeM && !GatherModeN && 
GatherModeK, + "Only one gather mode (M, N or K) is supported by example runner"); + + // Construct a reference (non-gather) GEMM kernel type + + using MainloopRef = Mainloop; + + using EpilogueRef = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter< + cutlass::epilogue::collective::DefaultEpilogue< + StrideC, StrideD, + typename Epilogue::ThreadEpilogueOp, + typename Epilogue::EpilogueSchedule + > + >; + + using KernelRef = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + MainloopRef, + EpilogueRef, + void + >; + + using GemmRef = cutlass::gemm::device::GemmUniversalAdapter; + + // Construct an optimized reference GEMM kernel type (using TMA) + + using EpilogueOpt = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + Shape<_128,_128,_64>, + Shape<_2,_2,_1>, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementComputeEpilogue, + ElementC, LayoutC, 128 / cutlass::sizeof_bits::value, + ElementD, LayoutD, 128 / cutlass::sizeof_bits::value, + cutlass::epilogue::collective::EpilogueScheduleAuto + >::CollectiveOp; + + using MainloopOpt = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementA, LayoutA, 128 / cutlass::sizeof_bits::value, + ElementB, LayoutB, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + Shape<_128,_128,_64>, + Shape<_2,_2,_1>, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename EpilogueOpt::SharedStorage))>, + cutlass::gemm::collective::KernelScheduleAuto + >::CollectiveOp; + + using KernelOpt = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + MainloopOpt, + EpilogueOpt, + void + >; + + using GemmOpt = cutlass::gemm::device::GemmUniversalAdapter; + + // Data members + + cutlass::gemm::BatchedGemmCoord problem_size_orig; + cutlass::gemm::BatchedGemmCoord problem_size; + ProblemShape problem_shape_orig; + ProblemShape problem_shape; + cutlass::KernelHardwareInfo hw_info; + + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + + StrideA stride_A_orig; + StrideB stride_B_orig; + StrideC stride_C_orig; + StrideD stride_D_orig; + + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + + cutlass::device_memory::allocation tensor_a; + cutlass::device_memory::allocation tensor_b; + cutlass::device_memory::allocation tensor_c; + cutlass::device_memory::allocation tensor_d; + + cutlass::device_memory::allocation gather_indices; + + cutlass::device_memory::allocation tensor_a_gathered; + cutlass::device_memory::allocation tensor_b_gathered; + cutlass::device_memory::allocation tensor_c_gathered; + cutlass::device_memory::allocation tensor_d_gathered; + cutlass::device_memory::allocation tensor_d_reference; + + cutlass::gemm::GemmUniversalMode gemm_mode; + + Gemm gemm; + typename Gemm::Arguments arguments; + cutlass::device_memory::allocation workspace; + + GemmRef gemm_ref; + typename GemmRef::Arguments arguments_ref; + cutlass::device_memory::allocation workspace_ref; + + GemmOpt gemm_opt; + typename GemmOpt::Arguments arguments_opt; + cutlass::device_memory::allocation workspace_opt; + + ExampleRunner(Options const &options, cutlass::KernelHardwareInfo const &hw_info) + : problem_size_orig(options.problem_size), + problem_size(GatherModeM ? options.index_size : problem_size_orig.m(), + GatherModeN ? options.index_size : problem_size_orig.n(), + GatherModeK ? 
options.index_size : problem_size_orig.k(), + problem_size_orig.batch()), + problem_shape_orig(problem_size_orig.m(), problem_size_orig.n(), problem_size_orig.k(), problem_size_orig.batch()), + problem_shape(problem_size.m(), problem_size.n(), problem_size.k(), problem_size.batch()), + hw_info(hw_info), + alpha(options.alpha), + beta(options.beta), + stride_A_orig(cutlass::make_cute_packed_stride( + StrideA{}, make_shape(problem_size_orig.m(), problem_size_orig.k(), problem_size_orig.batch()))), + stride_B_orig(cutlass::make_cute_packed_stride( + StrideB{}, make_shape(problem_size_orig.n(), problem_size_orig.k(), problem_size_orig.batch()))), + stride_C_orig(cutlass::make_cute_packed_stride( + StrideC{}, make_shape(problem_size_orig.m(), problem_size_orig.n(), problem_size_orig.batch()))), + stride_D_orig(cutlass::make_cute_packed_stride( + StrideD{}, make_shape(problem_size_orig.m(), problem_size_orig.n(), problem_size_orig.batch()))), + stride_A(cutlass::make_cute_packed_stride( + StrideA{}, make_shape(problem_size.m(), problem_size.k(), problem_size.batch()))), + stride_B(cutlass::make_cute_packed_stride( + StrideB{}, make_shape(problem_size.n(), problem_size.k(), problem_size.batch()))), + stride_C(cutlass::make_cute_packed_stride( + StrideC{}, make_shape(problem_size.m(), problem_size.n(), problem_size.batch()))), + stride_D(cutlass::make_cute_packed_stride( + StrideD{}, make_shape(problem_size.m(), problem_size.n(), problem_size.batch()))), + tensor_a(problem_size_orig.m() * problem_size_orig.k() * problem_size_orig.batch()), + tensor_b(problem_size_orig.k() * problem_size_orig.n() * problem_size_orig.batch()), + tensor_c(problem_size_orig.m() * problem_size_orig.n() * problem_size_orig.batch()), + tensor_d(problem_size_orig.m() * problem_size_orig.n() * problem_size_orig.batch()), + gather_indices(options.index_size), + tensor_a_gathered(problem_size.m() * problem_size.k() * problem_size_orig.batch()), + tensor_b_gathered(problem_size.k() * problem_size.n() * problem_size_orig.batch()), + tensor_c_gathered(problem_size.m() * problem_size.n() * problem_size_orig.batch()), + tensor_d_gathered(problem_size.m() * problem_size.n() * problem_size_orig.batch()), + tensor_d_reference(problem_size_orig.m() * problem_size_orig.n() * problem_size_orig.batch()), + gemm_mode(problem_size.batch() > 1 ? cutlass::gemm::GemmUniversalMode::kBatched : cutlass::gemm::GemmUniversalMode::kGemm), + gemm(), + // When constructing arguments for gather/scatter gemm, we must pass stride arguments + // made for the original (non-gathered) problem size, because they are used to access + // tensors of the original shape. However we still use the reduced (gathered) problem + // shape since it corresponds to the logical indexing in reduced size GEMM. + arguments{ + gemm_mode, + problem_shape, + { + tensor_a.get(), + stride_A_orig, + tensor_b.get(), + stride_B_orig + }, + { + { alpha, beta }, + tensor_c.get(), stride_C_orig, + tensor_d.get(), stride_D_orig, + typename Epilogue::GatherC {gather_indices.get()}, + typename Epilogue::ScatterD{gather_indices.get()} + }, + hw_info, + {}, + typename Kernel::GatherA{gather_indices.get()}, + typename Kernel::GatherB{gather_indices.get()} + }, + workspace(Gemm::get_workspace_size(arguments)), + gemm_ref(), + arguments_ref{ + gemm_mode, + problem_shape, + { + DoGatherA ? tensor_a_gathered.get() : tensor_a.get(), + stride_A, + DoGatherB ? tensor_b_gathered.get() : tensor_b.get(), + stride_B + }, + { + { alpha, beta }, + DoGatherC ? 
tensor_c_gathered.get() : tensor_c.get(), + stride_C, + DoScatterD ? tensor_d_gathered.get() : tensor_d_reference.get(), + stride_D + }, + hw_info + }, + workspace_ref(GemmRef::get_workspace_size(arguments_ref)), + gemm_opt(), + arguments_opt{ + gemm_mode, + problem_shape, + { + DoGatherA ? tensor_a_gathered.get() : tensor_a.get(), + stride_A, + DoGatherB ? tensor_b_gathered.get() : tensor_b.get(), + stride_B + }, + { + { alpha, beta }, + DoGatherC ? tensor_c_gathered.get() : tensor_c.get(), + stride_C, + DoScatterD ? tensor_d_gathered.get() : tensor_d_reference.get(), + stride_D + }, + hw_info + }, + workspace_opt(GemmOpt::get_workspace_size(arguments_opt)) + { + // Fill input and output matrices on host using CUTLASS helper functions + cutlass::reference::device::BlockFillRandomUniform(tensor_a.get(), tensor_a.size(), 1, ElementA(7), ElementA(-8), 0); + cutlass::reference::device::BlockFillRandomUniform(tensor_b.get(), tensor_b.size(), 1, ElementB(7), ElementB(-8), 0); + cutlass::reference::device::BlockFillRandomUniform(tensor_c.get(), tensor_c.size(), 1, ElementC(7), ElementC(-8), 0); + cutlass::reference::device::BlockFillSequential(tensor_d.get(), tensor_d.size(), ElementD(0), ElementD(0)); + + // <- Fill gather_indices with unique random integers in range [0,n) + int index_range = GatherModeM ? problem_size_orig.m() : (GatherModeN ? problem_size_orig.n() : problem_size_orig.k()); + std::vector indices(index_range); + std::iota(indices.begin(), indices.end(), 0); + { // std::random_shuffle was deprecated in C++14 and removed in C++17 + std::random_device make_seed; + std::mt19937 source_of_randomness(make_seed()); + std::shuffle(indices.begin(), indices.end(), source_of_randomness); + } + gather_indices.copy_from_host(indices.data()); + + auto const gemm_init = [](auto & gemm, auto const & arguments, auto & workspace) + { + cutlass::Status status = gemm.can_implement(arguments); + CUTLASS_CHECK(status); + status = gemm.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + }; + + gemm_init(gemm, arguments, workspace ); + gemm_init(gemm_ref, arguments_ref, workspace_ref); + gemm_init(gemm_opt, arguments_opt, workspace_opt); + } + + void debug_output(std::ostream & os) + { + auto print_tensor = [](std::ostream &os, char const * name, auto const & data, auto shape, auto stride) + { + std::vector> h_data(data.size()); + data.copy_to_host(h_data.data()); + Tensor t = make_tensor(h_data.data(), shape, stride); + os << "\n" << name << ": " << std::setw(4) << t << std::endl; + }; + { + auto [M,N,K,L] = problem_shape_orig; + print_tensor(os, "A", tensor_a, make_shape(M,K,L), stride_A_orig); + print_tensor(os, "B", tensor_b, make_shape(N,K,L), stride_B_orig); + print_tensor(os, "C", tensor_c, make_shape(M,N,L), stride_C_orig); + print_tensor(os, "D", tensor_d, make_shape(M,N,L), stride_D_orig); + print_tensor(os, "D reference", tensor_d_reference, make_shape(M,N,L), stride_D_orig); + print_tensor(os, "indices", gather_indices, make_shape(gather_indices.size()), make_stride(_1{})); + } + } + + template + static void run_gemm(Gemm2 &gemm) + { + cutlass::Status status = gemm.run(); + CUTLASS_CHECK(status); + } + + template + void run_reference(Gemm2 &gemm) + { + // Convenience wrapper around calls to separate gather/scatter kernels + auto run_gather = [this](auto call, auto const & input, auto & output, auto gather_func, auto batch_size, auto stride) + { + [[maybe_unused]] auto idx = find_if(stride, [](auto x){ return not is_constant<1, decltype(x)>{}; }); + constexpr int I = 
decltype(idx)::value; + call(input.get(), + output.get(), + gather_func, + batch_size, + static_cast(input.size() / batch_size), + static_cast(output.size() / batch_size), + static_cast(get(stride)), + hw_info); + }; + + // Forward calls via lambda to avoid specifying template arguments + auto gather_call = [](auto&&... args){ gather(static_cast(args)...); }; + // MSVC doesn't count use inside a false "if constexpr" branch. + [[maybe_unused]] auto scatter_call = [](auto&&... args){ scatter(static_cast(args)...); }; + + if constexpr (DoGatherA) { + run_gather(gather_call, tensor_a, tensor_a_gathered, arguments.gather_A, problem_size.batch(), stride_A); + } + if constexpr (DoGatherB) { + run_gather(gather_call, tensor_b, tensor_b_gathered, arguments.gather_B, problem_size.batch(), stride_B); + } + if constexpr (DoGatherC) { + if (beta != ElementComputeEpilogue(0)) { + run_gather(gather_call, tensor_c, tensor_c_gathered, arguments.epilogue.gather_C, problem_size.batch(), stride_C); + } + } + + run_gemm(gemm); + + if constexpr (DoScatterD) { + run_gather(scatter_call, tensor_d_gathered, tensor_d_reference, arguments.epilogue.scatter_D, problem_size.batch(), stride_D); + } + } + + bool verify() + { + run_gemm(gemm); + run_reference(gemm_ref); + cudaDeviceSynchronize(); + return cutlass::reference::device::BlockCompareEqual(tensor_d.get(), tensor_d_reference.get(), tensor_d.size()); + } + + bool run(Options const &options) + { + if (options.reference_check) { + if (!verify()) { + std::cout << "Failed validation" << std::endl; +#if 0 + debug_output(std::cout); +#endif + return false; + } + else { + std::cout << "Passed validation" << std::endl; + } + } + + // + // Run profiling loop + // + + auto const benchmark = [&](auto name, auto func) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + func(); + } + timer.stop(); + + double runtime = timer.elapsed_millis() / double(options.iterations); + double gflops = 2 * double(problem_size.product()) / 1e6 / runtime; // Two flops per multiply-add + + std::cout << name << ":\n"; + std::cout << " Runtime: " << runtime << " ms\n"; + std::cout << " GFLOPs: " << gflops << "\n"; + }; + + benchmark("Fused", [&](){ run_gemm(gemm); }); + benchmark("Unfused default", [&](){ run_reference(gemm_ref); }); + benchmark("Unfused optimized", [&](){ run_reference(gemm_opt); }); + + return true; + } +}; + +} // namespace example + +int main(int argc, const char ** argv) { + + bool notSupported = false; + + // CUDA 12 minimum required + if (__CUDACC_VER_MAJOR__ < 12) { + std::cerr << "This example requires CUDA Toolkit version 12 or later.\n"; + notSupported = true; + } + + cudaDeviceProp props; + CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); + + if (props.major < 9) { + std::cerr << "This example requires a device with compute capability 90 or higher.\n"; + notSupported = true; + } + if (notSupported) { + return EXIT_SUCCESS; // Do not fail CI checks on unsupported systems + } + + example::Options options; + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << "\n"; + return EXIT_SUCCESS; + } + + if (!options.valid()) { + std::cerr << "Invalid arguments." 
<< "\n"; + return EXIT_FAILURE; + } + + cutlass::KernelHardwareInfo hw_info; + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool result = true; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + + switch (options.mode) { + using namespace example; + case 0: { + std::cout << "Gather A,C + scatter D on M mode:" << std::endl; + using Runner = ExampleRunner< + cutlass::half_t, cutlass::layout::RowMajor, IndexedGather, // A + cutlass::half_t, cutlass::layout::ColumnMajor, NoGather, // B + cutlass::half_t, cutlass::layout::RowMajor, IndexedGather, // C + cutlass::half_t, cutlass::layout::RowMajor, IndexedGather, // D + float, float>; + result &= Runner(options, hw_info).run(options); + break; + } + case 1: { + std::cout << "Gather B,C + scatter D on N mode:" << std::endl; + using Runner = ExampleRunner< + cutlass::half_t, cutlass::layout::RowMajor, NoGather, // A + cutlass::half_t, cutlass::layout::ColumnMajor, IndexedGather, // B + cutlass::half_t, cutlass::layout::ColumnMajor, IndexedGather, // C + cutlass::half_t, cutlass::layout::ColumnMajor, IndexedGather, // D + float, float>; + result &= Runner(options, hw_info).run(options); + break; + } + case 2: { + std::cout << "Gather A,B on K mode:" << std::endl; + using Runner = ExampleRunner< + cutlass::half_t, cutlass::layout::ColumnMajor, IndexedGather, // A + cutlass::half_t, cutlass::layout::RowMajor, IndexedGather, // B + cutlass::half_t, cutlass::layout::RowMajor, NoGather, // C + cutlass::half_t, cutlass::layout::RowMajor, NoGather, // D + float, float>; + result &= Runner(options, hw_info).run(options); + break; + } + } +#endif + + return result ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/examples/52_hopper_gather_scatter_fusion/CMakeLists.txt b/examples/52_hopper_gather_scatter_fusion/CMakeLists.txt new file mode 100644 index 0000000000..bf67537002 --- /dev/null +++ b/examples/52_hopper_gather_scatter_fusion/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + 52_hopper_gather_scatter_fusion + 52_hopper_gather_scatter_fusion.cu + ) diff --git a/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp b/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp new file mode 100644 index 0000000000..c71109aa79 --- /dev/null +++ b/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp @@ -0,0 +1,421 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/kernel_hardware_info.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cute/tensor.hpp" + +#include "gather_tensor.hpp" + +namespace cutlass { + ///Forward declaration + struct CudaHostAdapter; +} + +namespace cutlass::gemm::kernel { + +/////////////////////////////////////////////////////////////////////////////// + +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class TileScheduler_, + class GatherA_, + class GatherB_ +> +class GemmGather +{ +public: + // + // Type Aliases + // + using ProblemShape = ProblemShape_; + static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4, + "ProblemShape{} should be or "); + + // Mainloop derived types + using CollectiveMainloop = CollectiveMainloop_; + using TileShape = typename CollectiveMainloop::TileShape; + using TiledMma = typename CollectiveMainloop::TiledMma; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ElementA = typename CollectiveMainloop::ElementA; + using StrideA = typename CollectiveMainloop::StrideA; + using ElementB = typename CollectiveMainloop::ElementB; + using StrideB = typename CollectiveMainloop::StrideB; + using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; + using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; + using ClusterShape = typename DispatchPolicy::ClusterShape; + using MainloopArguments = typename CollectiveMainloop::Arguments; + using MainloopParams = typename CollectiveMainloop::Params; + static_assert(ArchTag::kMinComputeCapability >= 90); + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using ElementC = typename CollectiveEpilogue::ElementC; + using StrideC = typename CollectiveEpilogue::StrideC; + using ElementD = typename CollectiveEpilogue::ElementD; + using StrideD = typename CollectiveEpilogue::StrideD; + using EpilogueArguments = typename CollectiveEpilogue::Arguments; + using EpilogueParams = typename CollectiveEpilogue::Params; + + static_assert(cute::is_void_v or cute::is_same_v, + "Non-persistent warp-specialized kernel does not support specializing the tile scheduler."); + using TileSchedulerTag = TileScheduler_; + using TileScheduler = typename detail::TileSchedulerSelector< + TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler; + using TileSchedulerArguments = typename TileScheduler::Arguments; + + using GatherA = GatherA_; + using GatherB = GatherB_; + + // Kernel level shared memory storage + struct SharedStorage { + union TensorStorage { + using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage; + using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; + + MainloopTensorStorage mainloop; + EpilogueTensorStorage epilogue; + } tensors; + + struct PipelineStorage : cute::aligned_struct<16, _2> { + using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage; + using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage; + + alignas(16) MainloopPipelineStorage mainloop; + alignas(16) EpiLoadPipelineStorage epi_load; + } pipelines; + }; + + static constexpr int SharedStorageSize = sizeof(SharedStorage); + + using GmemTiledCopyA = typename CollectiveMainloop::GmemTiledCopyA; + using GmemTiledCopyB = typename CollectiveMainloop::GmemTiledCopyB; + 
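    // The widths of these two tiled copies, together with the TiledMma size, fix how the
    // thread block is partitioned into producer (load) and consumer (MMA) warp groups by the
    // constants that follow. For example (numbers purely illustrative): a 128-thread tiled
    // copy and a 256-thread TiledMma give 1 load warp group + 2 MMA warp groups = 3 warp
    // groups, i.e. a 384-thread block, which satisfies the 2-or-3 warp-group assertion below.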
static_assert(cute::size(GmemTiledCopyA{}) == cute::size(GmemTiledCopyB{}), "Number of threads in A/B tiled copies must be the same."); + + static constexpr uint32_t NumLoadWarpGroups = cute::size(GmemTiledCopyA{}) / NumThreadsPerWarpGroup; + static constexpr uint32_t NumMmaWarpGroups = CUTE_STATIC_V(cute::size(TiledMma{})) / NumThreadsPerWarpGroup; + static constexpr uint32_t NumWarpGroups = NumLoadWarpGroups + NumMmaWarpGroups; + static_assert(NumWarpGroups == 2 || NumWarpGroups == 3, "Number of warp groups must be 2 or 3 for good performance."); + + static constexpr uint32_t MaxThreadsPerBlock = NumWarpGroups * NumThreadsPerWarpGroup; + static constexpr uint32_t MinBlocksPerMultiprocessor = 1; + + // Device side arguments + struct Arguments { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopArguments mainloop{}; + EpilogueArguments epilogue{}; + KernelHardwareInfo hw_info{}; + TileSchedulerArguments scheduler{}; + GatherA gather_A{}; + GatherB gather_B{}; + }; + + // Kernel entry point API + struct Params { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopParams mainloop{}; + EpilogueParams epilogue{}; + GatherA gather_A{}; + GatherB gather_B{}; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args, void* workspace) { + (void) workspace; + auto problem_shape = args.problem_shape; + if constexpr (detail::Has_SwapAB_v) { + // swap M/N + get<0>(problem_shape) = get<1>(args.problem_shape); + get<1>(problem_shape) = get<0>(args.problem_shape); + } + return { + args.mode, + problem_shape, + CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace), + CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace), + args.gather_A, + args.gather_B + }; + } + + static bool + can_implement(Arguments const& args) { + bool implementable = (args.mode == GemmUniversalMode::kGemm) or + (args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n"); + return implementable; + } + implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop); + implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue); + return implementable; + } + + static + size_t + get_workspace_size(Arguments const& args) { + return 0; + } + + static + cutlass::Status + initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + return Status::kSuccess; + } + + // Computes the kernel launch grid shape based on runtime parameters + static dim3 + get_grid_shape(Params const& params) { + auto cluster_shape = Shape<_1,_1,_1>{}; + auto tile_shape = TileShape{}; + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + return TileScheduler::get_tiled_cta_shape_mnl( + problem_shape_MNKL, tile_shape, cluster_shape); + } + + static dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + using namespace cute; + using X = Underscore; + + // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a. + #if ! 
defined(__CUDA_ARCH_FEAT_SM90_ALL) + if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) { + printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n"); + return; + } + #endif + + enum class WarpGroupRole { + Producer = 0, + Consumer = 1, + }; + + // Kernel level shared memory storage + SharedStorage& shared_storage = *reinterpret_cast(smem_buf); + + int thread_idx = int(threadIdx.x); + int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup; + int warp_group_idx = canonical_warp_group_idx(); + CUTLASS_ASSERT(warp_group_idx < NumWarpGroups); + WarpGroupRole warp_group_role = warp_group_idx < NumLoadWarpGroups ? WarpGroupRole::Producer : WarpGroupRole::Consumer; + + // Mainloop Load pipeline + using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline; + typename MainloopPipeline::Params mainloop_pipeline_params; + if (warp_group_role == WarpGroupRole::Producer) { + mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer; + } + if (warp_group_role == WarpGroupRole::Consumer) { + mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer; + } + mainloop_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup; + mainloop_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup; + MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params); + + // Epilogue Load pipeline + using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline; + typename EpiLoadPipeline::Params epi_load_pipeline_params; + if (warp_group_role == WarpGroupRole::Producer) { + epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer; + } + if (warp_group_role == WarpGroupRole::Consumer) { + epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer; + } + epi_load_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup; + epi_load_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup; + EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params); + + // Epilogue Store pipeline + using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline; + typename EpiStorePipeline::Params epi_store_pipeline_params; + epi_store_pipeline_params.always_wait = true; + EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params); + + // Initialize starting pipeline states for the collectives + typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state; + typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state; + + // For the DMA Load (producer) we start with an opposite phase + // i.e., we skip all waits since we know that the buffer is indeed empty + PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state(); + PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state(); + PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state(); + + // Preconditions + static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. 
If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + + // Separate out problem shape for convenience + // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + auto M = get<0>(problem_shape_MNKL); + auto N = get<1>(problem_shape_MNKL); + auto K = get<2>(problem_shape_MNKL); + auto L = get<3>(problem_shape_MNKL); + + // Represent the full tensors + Tensor mA_mkl = make_gather_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA, params.gather_A); //(m,k,l) + Tensor mB_nkl = make_gather_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB, params.gather_B); //(n,k,l) + + // Get the appropriate blocks for this thread block -- potential for thread block locality + auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K) + TiledMma tiled_mma; + + // Make tiled views, defer the slice + Tensor gA_mkl = local_tile(mA_mkl, blk_shape, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) + Tensor gB_nkl = local_tile(mB_nkl, blk_shape, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) + + // Compute m_coord, n_coord, and l_coord with their post-tiled shapes + auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl)); + auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl)); + auto l_coord = idx2crd(int(blockIdx.z), shape<4>(gB_nkl)); + auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); + + // Slice with m_coord and n_coord + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) + + // Get pipeline iterators and increments from tensor shapes + auto k_tile_iter = cute::make_coord_iterator(shape<2>(gA)); + auto k_tile_count = size<2>(gA); + auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape); + auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape); + + // Wait for all threads in the thread block + __syncthreads(); + + // In a warp specialized kernel, collectives expose data movement and compute operations separately + CollectiveMainloop collective_mainloop; + CollectiveEpilogue collective_epilogue{params.epilogue, shared_storage.tensors.epilogue}; + + if (warp_group_role == WarpGroupRole::Producer) { + // Compute tile residues for predication + auto m_max_coord = M - size<0>(gA) * get<0>(blk_coord); // M - BLK_M * m_coord + auto n_max_coord = N - size<0>(gB) * get<1>(blk_coord); // N - BLK_N * n_coord + auto k_residue = K - size<1>(gA) * size<2>(gA); // K - BLK_K * k_coord_max + auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue); + + collective_mainloop.load( + mainloop_pipeline, + mainloop_pipe_producer_state, + gA, + gB, + k_tile_iter, k_tile_count, + residue_mnk, + thread_idx, + shared_storage.tensors.mainloop + ); + // Update starting mainloop pipeline state for the pipeline drain + mainloop_pipe_producer_state.advance(k_tile_count); + // Make sure mainloop consumer has been waited upon before issuing epilogue load + collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state); + + if (collective_epilogue.is_producer_load_needed()) { + epi_load_pipe_producer_state = + collective_epilogue.load( + epi_load_pipeline, + epi_load_pipe_producer_state, + problem_shape_MNKL, + blk_shape, + blk_coord, + tiled_mma, + thread_idx, + 
shared_storage.tensors.epilogue + ); + collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state); + } + } + else if (warp_group_role == WarpGroupRole::Consumer) { + Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N) + + collective_mainloop.mma( + mainloop_pipeline, + mainloop_pipe_consumer_state, + accumulators, + k_tile_count, + warp_group_thread_idx, + shared_storage.tensors.mainloop, + params.mainloop + ); + + // Make sure the math instructions are done and free buffers before entering the epilogue + collective_mainloop.mma_tail( + mainloop_pipeline, + mainloop_pipe_consumer_state, + k_tile_count + ); + + // Epilogue and write to gD + collective_epilogue.store( + epi_load_pipeline, + epi_load_pipe_consumer_state, + epi_store_pipeline, + epi_store_pipe_producer_state, + problem_shape_MNKL, + blk_shape, + blk_coord, + accumulators, + tiled_mma, + warp_group_thread_idx, + shared_storage.tensors.epilogue + ); + } + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel diff --git a/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh b/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh new file mode 100644 index 0000000000..592bf57e39 --- /dev/null +++ b/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh @@ -0,0 +1,136 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cute/numeric/math.hpp" + +namespace example +{ + +// Naive grid-stride loop implementation of gather +template +__global__ void +gather_kernel(Element const * __restrict__ input, + Element * __restrict__ output, + Func func, + int num_elems_input, + int num_elems_output, + cutlass::FastDivmod stride_divmod) +{ + Element const * input_b = input + blockIdx.z * num_elems_input; + Element * output_b = output + blockIdx.z * num_elems_output; + int tidx = threadIdx.x + blockIdx.x * blockDim.x; + for (int k = tidx; k < num_elems_output; k += blockDim.x * gridDim.x) { + int i,j; + stride_divmod(j, i, k); + output_b[k] = input_b[i + func(j) * stride_divmod.divisor]; + } +} + +// Gather elements along strided dimension of the tensor according to given indices +template +void +gather(Element const * input, + Element * output, + Func func, + int batch_size, + int num_elems_input, + int num_elems_output, + int stride, + cutlass::KernelHardwareInfo const& hw_info) +{ + // Upcast to uint128_t data type + int factor = 128 / cutlass::sizeof_bits::value; + assert(stride % factor == 0); + int stride_upcast = stride/factor; + int num_elems_input_upcast = num_elems_input / factor; + int num_elems_output_upcast = num_elems_output / factor; + + cutlass::FastDivmod stride_divmod(stride_upcast); + dim3 blocks(hw_info.sm_count, 1, batch_size); + gather_kernel<<>>(reinterpret_cast(input), + reinterpret_cast(output), + func, + num_elems_input_upcast, + num_elems_output_upcast, + stride_divmod); +} + +// Naive grid-stride loop implementation of scatter +template +__global__ void +scatter_kernel(Element const * __restrict__ input, + Element * __restrict__ output, + Func func, + int num_elems_input, + int num_elems_output, + cutlass::FastDivmod stride_divmod) +{ + Element const * input_b = input + blockIdx.z * num_elems_input; + Element * output_b = output + blockIdx.z * num_elems_output; + int tidx = threadIdx.x + blockIdx.x * blockDim.x; + for (int k = tidx; k < num_elems_input; k += blockDim.x * gridDim.x) { + int i,j; + stride_divmod(j, i, k); + output_b[i + func(j) * stride_divmod.divisor] = input_b[k]; + } +} + +// Gather elements along strided dimension of the tensor according to given indices +template +void +scatter(Element const * input, + Element * output, + Func func, + int batch_size, + int num_elems_input, + int num_elems_output, + int stride, + cutlass::KernelHardwareInfo const& hw_info) +{ + // Upcast to uint128_t data type + int factor = 128 / cutlass::sizeof_bits::value; + assert(stride % factor == 0); + int stride_upcast = stride/factor; + int num_elems_input_upcast = num_elems_input / factor; + int num_elems_output_upcast = num_elems_output / factor; + + cutlass::FastDivmod stride_divmod(stride_upcast); + dim3 blocks(hw_info.sm_count, 1, batch_size); + scatter_kernel<<>>(reinterpret_cast(input), + reinterpret_cast(output), + func, + num_elems_input_upcast, + num_elems_output_upcast, + stride_divmod); +} + +} // namespace example diff --git a/examples/52_hopper_gather_scatter_fusion/scatter_epilogue.hpp b/examples/52_hopper_gather_scatter_fusion/scatter_epilogue.hpp new file mode 100644 index 0000000000..dc9c0df804 --- /dev/null +++ b/examples/52_hopper_gather_scatter_fusion/scatter_epilogue.hpp @@ -0,0 +1,222 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" + +#include "cute/tensor.hpp" +#include "cute/numeric/numeric_types.hpp" + +#include "gather_tensor.hpp" + +namespace cutlass::epilogue::collective { + +/// Applies an element wise operation to all elements within the fragment +/// and scatter-writes them out to destination storage. +/// GatherC and ScatterD are types of user-defined functions that apply the +/// transoformation of the strided coordinate (e.g. through an index array). +template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class EpilogueSchedule_, + class GatherC_, + class ScatterD_ +> +class EpilogueGatherScatter { +public: + // + // Type Aliases + // + using EpilogueSchedule = EpilogueSchedule_; + + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + + // Every epilogue needs these two GmemTiledCopy{C,D} aliases. + // If you don't know what they should be, just use void. 
+ using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + + using GatherC = GatherC_; + using ScatterD = ScatterD_; + + static const int kOutputAlignment = ThreadEpilogueOp::kCount; + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage { }; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread_params{}; + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + GatherC gather_C{}; + ScatterD scatter_D{}; + }; + + // Device side epilogue params + using Params = Arguments; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& _, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + EpilogueGatherScatter(Params const& params_) : params(params_) { } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + char* smem_buf) + { + using namespace cute; + using X = Underscore; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + (void) smem_buf; + ThreadEpilogueOp epilogue_op{params.thread_params}; + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + auto stride_c = detail::get_epilogue_stride(params.dC); + auto stride_d = detail::get_epilogue_stride(params.dD); + + // Represent the full output tensor + Tensor mC_mnl = make_gather_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), stride_c, params.gather_C); // (m,n,l) + Tensor mD_mnl = make_gather_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d, params.scatter_D); // (m,n,l) + + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + + // Partition source and destination tiles to match the accumulator partitioning + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCgD = thr_mma.partition_C(gD); // (VEC,THR_M,THR_N) + Tensor tCgC = thr_mma.partition_C(gC); // (VEC,THR_M,THR_N) + + static_assert(is_static::value, "Accumulator layout must be static"); + 
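+
+    // The gather/scatter views gC/gD are partitioned with the same tiled-MMA partitioner as a plain
+    // C/D tensor would be, so the per-thread fragments must match the accumulator fragment
+    // element-for-element; the assertions below check exactly that.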
CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators), + "Accumulator count must have the same destination element count."); + + // Make an identity coordinate tensor for predicating our output MN tile + auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD)))); + Tensor tCcD = thr_mma.partition_C(cD); + + // source is needed + if (epilogue_op.is_source_needed()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + tCgD(i) = epilogue_op(accumulators(i), tCgC(i)); + } + } + } + // source is not needed, avoid load + else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + tCgD(i) = epilogue_op(accumulators(i)); + } + } + } + } + +private: + Params params; +}; + +} // namespace cutlass::epilogue::collective + diff --git a/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu new file mode 100644 index 0000000000..d24c5f294a --- /dev/null +++ b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu @@ -0,0 +1,979 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Hopper GEMM+permute example. + + This example demonstrates the fusion of tensor permutation operations with a Hopper GEMM kernel. 
+  It is similar in spirit to example 39_gemm_permute, but uses the CUTLASS 3 CollectiveBuilder API to
+  construct kernels that make use of Hopper architecture features: Tensor Memory Accelerator (TMA)
+  units and warpgroup-level MMA instructions.
+
+  Background
+  ----------
+
+  While a GEMM kernel computes a product of two matrices (rank-2 tensors), the source data may
+  come from higher-rank tensors by combining some of its modes (dimensions) into the row and column
+  modes of the matrix. These tensors are often outputs from previous layers of a network, and the
+  data may sometimes need to be reordered in memory before a GEMM is computed. Similarly, the output
+  of a GEMM may need to be reordered before a subsequent operation can be executed.
+
+  Consider this sample PyTorch code:
+
+    # Forward pass
+    D = torch.mm(A, B).view(M/D1, D1, D2, N/D2).permute(0, 2, 1, 3)
+
+    # Backward pass
+    grad_A = torch.mm(grad_D.permute(0, 2, 1, 3).view(M, N), B)
+
+  Executing the reordering as a separate operation requires committing the intermediate tensor to
+  memory, which increases the latency and memory footprint of the model. By fusing the permutation
+  with either the reading of the A/B matrices or the writing of the D matrix, we can avoid the
+  unnecessary global memory traffic and kernel launch overhead.
+
+  Implementation
+  --------------
+
+  The approach relies on two things:
+  - The ability of CUTLASS 3 to naturally perform general tensor contractions (GETT) owing to the
+    flexibility of CuTe's hierarchical layouts (see example 51_hopper_gett for more details).
+  - The hardware capabilities of Hopper TMA units, which allow loading multidimensional tensors with
+    (almost) arbitrary strides that can be used to represent a permuted view of the data.
+
+  In this example we reuse the permutation classes of example 39_gemm_permute as operation tags.
+  For each tag, a specialization of struct PermuteTraits<> provides the necessary information about
+  the target tensor shape and ordering of modes. The main class, ExampleRunner, then figures out the
+  overall (hierarchical) shape of the GEMM operation and computes the shape and strides for each
+  tensor, taking into account the permutation applied. We highlight the importance of specifying
+  consistent multidimensional shapes for all tensors (even those that are not permuted), as well as
+  choosing hierarchical GEMM tile sizes that best fit those shapes (in cases where some tensor
+  dimensions are known at compile time).
+
+  In addition, this example implements a standalone permutation kernel that is used both to verify
+  correctness of the fused kernel and to benchmark the fused kernel against an unfused version that
+  writes the intermediate tensor to memory.
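+
+  To make the layout-based view of permutation concrete, here is a small illustrative CuTe sketch
+  (a hypothetical standalone snippet, not code used by this example; the shapes are arbitrary).
+  A row-major 4x8 matrix and its transposed view share storage and differ only in their strides:
+
+    #include <cute/tensor.hpp>
+    using namespace cute;
+
+    float buf[32];
+    auto A  = make_tensor(&buf[0], make_layout(make_shape(4, 8), make_stride(8, 1)));  // (4,8):(8,1)
+    auto At = make_tensor(&buf[0], make_layout(make_shape(8, 4), make_stride(1, 8)));  // (8,4):(1,8)
+    // At(i,j) aliases A(j,i) -- no data movement is needed to "permute" the matrix.
+
+  Higher-rank permutations follow the same idea: a matrix mode is first reshaped into a nested
+  shape (e.g. M -> (M/D1, D1)), and the strides of the sub-modes are reordered according to the
+  desired permutation. The PermuteTraits<> specializations used below encode this information,
+  and the TMA units can then read or write the data directly through such strided views.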
+*/ + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/mma.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/permute.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" + +#include "helper.h" +#include "permute_kernel.cuh" +#include "permute_traits.hpp" + +namespace example +{ + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +struct Options { + + bool help; + + cutlass::gemm::BatchedGemmCoord problem_size; + + float alpha; + float beta; + + bool reference_check; + int iterations; + + bool verbose; + + Options(): + help(false), + problem_size({2048, 2048, 2048, 8}), + alpha(1.0), + beta(1.0), + reference_check(true), + iterations(20), + verbose(false) { } + + bool valid() const { + return problem_size.m() > 0 + && problem_size.n() > 0 + && problem_size.k() > 0 + && problem_size.batch() > 0 + && iterations > 0; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + cmd.get_cmd_line_argument("m", problem_size.m()); + cmd.get_cmd_line_argument("n", problem_size.n()); + cmd.get_cmd_line_argument("k", problem_size.k()); + cmd.get_cmd_line_argument("batch_size", problem_size.batch()); + + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + + cmd.get_cmd_line_argument("check", reference_check, true); + cmd.get_cmd_line_argument("iterations", iterations); + + cmd.get_cmd_line_argument("verbose", verbose, false); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << + "53_hopper_gemm_permute example\n" + "\n" + " This example uses the CUTLASS Library to fuse permute() on input/output tensors with GEMM\n" + "\n" + "Options:\n" + " --help If specified, displays this usage statement.\n" + " --m= GEMM M dimension\n" + " --n= GEMM N dimension\n" + " --k= GEMM K dimension\n" + " --alpha= GEMM alpha parameter\n" + " --beta= GEMM beta parameter\n" + " --iterations= Number of profiling iterations to perform.\n" + " --check= Validate results against a reference (unfused) imlementation" + " --verbose= Enable verbose output" + "\n" + "Examples:\n" + "\n" + "$ ./examples/53_hopper_gemm_permute/53_hopper_gemm_permute --m=4096 --n=2048 --k=3072 --batch_size=8\n"; + + return out; + } +}; + +using namespace cute; + +// Check the shapes assigned to the same mode of different tensors, +// ensure all permuted shapes are the same and return that shape. +template +auto +select_mode_shape(Shapes const & ... 
shapes) { + auto permuted_shapes = filter_tuple(cute::make_tuple(shapes...), [](auto shape) { + if constexpr (cute::rank(shape) > 1) { + return cute::make_tuple(shape); + } + else { + return cute::make_tuple(); + } + }); + if constexpr (cute::rank(permuted_shapes) == 0) { + return get<0>(cute::make_tuple(shapes...)); + } + else { + auto ref_shape = get<0>(permuted_shapes); + for_each(permuted_shapes, [&](auto shape) { + // This static assert fails to compile on GCC 7.5 + // static_assert(is_same::value, "Inconsistent shapes for the same mode"); + // This runtime check can be skipped if all permutations are required to be static. + if (shape != ref_shape) + { + print("Inconsistent shapes for the same mode: "); + print(ref_shape); print(" and "); print(shape); print("\n"); + exit(EXIT_FAILURE); + } + }); + return ref_shape; + } +} + +template +auto +compute_default_stride(Shape const & shape, StrideOrig const & stride_orig) { + // Only supports column-major and row-major, batch stride always comes last + if constexpr (is_constant<1, decltype(get<0>(stride_orig))>::value) { + return compact_col_major(shape); + } + else + { + return compact_order(shape, Step<_1,_0,_2>{}); + } +} + +// Divide a static scalar TileSize into static modes of Shape until either: +// - a dynamic mode is encountered +// - we run out of size to divide +// - no longer divisible by next shape +// Examples: +// select_tile_shape(_128, (_8,_16)) -> (_8,_16) +// select_tile_shape(_128, (_8,_32)) -> (_8,_16) +// select_tile_shape(_128, (_8, _4)) -> (_8,_4,_4) +// select_tile_shape(_128, (_8, 4)) -> (_8,_16) +template +auto +select_tile_shape(TileSize size, Shape const& shape) +{ + static_assert(is_static::value, "Tile size must be static"); + if constexpr (cute::rank(Shape{}) == 0) { + return cute::make_tuple(size); + } + else { + if constexpr (is_static>::value) { + auto div = front(shape); + if constexpr (size > div and size % div == 0) { + return prepend(select_tile_shape(size / div, take<1,tuple_size_v>(shape)), div); + } + else { + return cute::make_tuple(size); + } + } + else { + return cute::make_tuple(size); + } + } +} + +template +class ExampleRunner +{ +private: + + // Define shapes for each operand and original GEMM problem as a whole. + + using MatrixShape = Shape; // [M,N,L]/[M,K,L]/[N,K,L] + using ProblemShape = Shape; // [M,N,K,L] + + // Determine the CuTe stride for each of the four operands. + + using StrideA = cutlass::gemm::TagToStrideA_t; + using StrideB = cutlass::gemm::TagToStrideB_t; + using StrideC = cutlass::gemm::TagToStrideC_t; + using StrideD = cutlass::gemm::TagToStrideC_t; + + // Flags to check which operands will be permuted. + + static constexpr bool DoPermuteA = not cutlass::layout::is_trivial_permute; + static constexpr bool DoPermuteB = not cutlass::layout::is_trivial_permute; + static constexpr bool DoPermuteC = not cutlass::layout::is_trivial_permute; + static constexpr bool DoPermuteD = not cutlass::layout::is_trivial_permute; + + // For input operands, we must use inverse of the permutation operation + // to read data that is stored in original (un-permuted) order. + + using PermuteAReal = typename cutlass::layout::InversePermute::type; + using PermuteBReal = typename cutlass::layout::InversePermute::type; + using PermuteCReal = typename cutlass::layout::InversePermute::type; + using PermuteDReal = PermuteD; + + // Get permutation layout for each operand. 
+ // A permutation layout is a rank-3 layout in the usual CuTe mode ordering, + // but each mode may have a nested shape corresponding to the reshaping of + // the matrix into a multidimensional tensor, and the strides are computed + // taking the desired permutation into account. + + template + using LayoutPermute = remove_cvref_t(make_layout(MatrixShape{}, Stride{})))>; + + using LayoutAP = LayoutPermute; + using LayoutBP = LayoutPermute; + using LayoutCP = LayoutPermute; + using LayoutDP = LayoutPermute; + + // Now we want to build the unified problem shape for permute-GEMM. + // To do this, we check the corresponding mode in each tensor that has it. + // If at least one tensor has a mode that has been reshaped (i.e. rank > 1), + // its shape will be used as the reference shape for that mode in all tensors. + // If multiple tensors have reshaped mode, we additionally check that their + // shapes for that mode match. Otherwise, we can't define a consistent GEMM shape. + + using ShapeM = decltype(select_mode_shape(shape<0>(LayoutAP{}), shape<0>(LayoutCP{}), shape<0>(LayoutDP{}))); + using ShapeN = decltype(select_mode_shape(shape<0>(LayoutBP{}), shape<1>(LayoutCP{}), shape<1>(LayoutDP{}))); + using ShapeK = decltype(select_mode_shape(shape<1>(LayoutAP{}), shape<1>(LayoutBP{}))); + using ShapeL = decltype(select_mode_shape(shape<2>(LayoutAP{}), shape<2>(LayoutBP{}), shape<2>(LayoutCP{}), shape<2>(LayoutDP{}))); + + using ProblemShapePermute = Shape; + + using ShapeAPermute = Shape; + using ShapeBPermute = Shape; + using ShapeCPermute = Shape; + using ShapeDPermute = Shape; + + // Next, we must define the strides for each tensor. + // If the tensor is permuted, we take the strides produced by the permutation function. + // Otherwise, we compute default strides induced by the new (multidimensional) shape of the tensor. + // + // This won't always work in general if multiple tensors are permuted: e.g. if PermuteA affects + // modes M and K, and PermuteB affects modes N and L, the single stride for mode L of tensor A + // computed by PermuteA will be non-congruent with it's shape that is changed by PermuteB. + // To handle this correctly, a more complicated logic is needed to reconstruct multi-mode strides. + // This is not addressed here, as it's not a common requirement to permute multiple tensors in one GEMM. + + using StrideAPermute = conditional_t, decltype(compute_default_stride(ShapeAPermute{}, StrideA{}))>; + using StrideBPermute = conditional_t, decltype(compute_default_stride(ShapeBPermute{}, StrideB{}))>; + using StrideCPermute = conditional_t, decltype(compute_default_stride(ShapeCPermute{}, StrideC{}))>; + using StrideDPermute = conditional_t, decltype(compute_default_stride(ShapeDPermute{}, StrideD{}))>; + + // We need to select optimal tile shape based on the tile size specified by the user. + // This is done by dividing the tile size in each mode by the mode shape as much + // as possible (i.e. until we run out of tile size or encounter a dynamic sub-shape). + + using TileMPermute = decltype(select_tile_shape(get<0>(TileShape{}), ShapeM{})); + using TileNPermute = decltype(select_tile_shape(get<1>(TileShape{}), ShapeN{})); + using TileKPermute = decltype(select_tile_shape(get<2>(TileShape{}), ShapeK{})); + + using TileShapePermute = Shape; + + // Now we are ready to define the GEMM kernel types for both fused permute and reference paths. 
+ + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementEpilogue, + ElementC, StrideC, 128 / cutlass::sizeof_bits::value, + ElementD, StrideD, 128 / cutlass::sizeof_bits::value, + cutlass::epilogue::collective::EpilogueScheduleAuto + >::CollectiveOp; + + using CollectiveEpiloguePermute = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShapePermute, ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementEpilogue, + ElementC, StrideCPermute, 128 / cutlass::sizeof_bits::value, + ElementD, StrideDPermute, 128 / cutlass::sizeof_bits::value, + cutlass::epilogue::collective::EpilogueScheduleAuto + >::CollectiveOp; + + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementA, StrideA, 128 / cutlass::sizeof_bits::value, + ElementB, StrideB, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, + cutlass::gemm::collective::KernelScheduleAuto + >::CollectiveOp; + + using CollectiveMainloopPermute = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementA, StrideAPermute, 128 / cutlass::sizeof_bits::value, + ElementB, StrideBPermute, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + TileShapePermute, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpiloguePermute::SharedStorage))>, + cutlass::gemm::collective::KernelScheduleAuto + >::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using GemmKernelPermute = cutlass::gemm::kernel::GemmUniversal< + ProblemShapePermute, + CollectiveMainloopPermute, + CollectiveEpiloguePermute + >; + + using GemmReference = cutlass::gemm::device::GemmUniversalAdapter; + using GemmPermute = cutlass::gemm::device::GemmUniversalAdapter; + + // Data members + + cutlass::gemm::BatchedGemmCoord problem_size; + ProblemShape problem_shape; + cutlass::KernelHardwareInfo hw_info; + + ElementEpilogue alpha; + ElementEpilogue beta; + + MatrixShape shape_A; + MatrixShape shape_B; + MatrixShape shape_C; + MatrixShape shape_D; + + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + + LayoutAP layout_AP; + LayoutBP layout_BP; + LayoutCP layout_CP; + LayoutDP layout_DP; + + ShapeM shape_M; + ShapeN shape_N; + ShapeK shape_K; + ShapeL shape_L; + + ProblemShapePermute problem_shape_permute; + + ShapeAPermute shape_A_permute; + ShapeBPermute shape_B_permute; + ShapeCPermute shape_C_permute; + ShapeDPermute shape_D_permute; + + StrideAPermute stride_A_permute; + StrideBPermute stride_B_permute; + StrideCPermute stride_C_permute; + StrideDPermute stride_D_permute; + + cutlass::device_memory::allocation tensor_a; + cutlass::device_memory::allocation tensor_b; + cutlass::device_memory::allocation tensor_c; + cutlass::device_memory::allocation tensor_d; + + cutlass::device_memory::allocation tensor_a_permuted; + cutlass::device_memory::allocation tensor_b_permuted; + cutlass::device_memory::allocation 
tensor_c_permuted; + cutlass::device_memory::allocation tensor_d_unpermuted; + cutlass::device_memory::allocation tensor_d_reference; + + cutlass::gemm::GemmUniversalMode gemm_mode; + + GemmPermute gemm_permute; + typename GemmPermute::Arguments arguments_permute; + cutlass::device_memory::allocation workspace_permute; + + GemmReference gemm_reference; + typename GemmReference::Arguments arguments_reference; + cutlass::device_memory::allocation workspace_reference; + + public: + + ExampleRunner(Options const & options, cutlass::KernelHardwareInfo const & hw_info) + : problem_size(options.problem_size), + problem_shape(problem_size.m(), problem_size.n(), problem_size.k(), problem_size.batch()), + hw_info(hw_info), + alpha(options.alpha), + beta(options.beta), + shape_A(make_shape(problem_size.m(), problem_size.k(), problem_size.batch())), + shape_B(make_shape(problem_size.n(), problem_size.k(), problem_size.batch())), + shape_C(make_shape(problem_size.m(), problem_size.n(), problem_size.batch())), + shape_D(make_shape(problem_size.m(), problem_size.n(), problem_size.batch())), + stride_A(cutlass::make_cute_packed_stride(StrideA{}, shape_A)), + stride_B(cutlass::make_cute_packed_stride(StrideB{}, shape_B)), + stride_C(cutlass::make_cute_packed_stride(StrideC{}, shape_C)), + stride_D(cutlass::make_cute_packed_stride(StrideD{}, shape_D)), + layout_AP(make_permute_layout(make_layout(shape_A, stride_A))), + layout_BP(make_permute_layout(make_layout(shape_B, stride_B))), + layout_CP(make_permute_layout(make_layout(shape_C, stride_C))), + layout_DP(make_permute_layout(make_layout(shape_D, stride_D))), + shape_M(select_mode_shape(shape<0>(layout_AP), shape<0>(layout_CP), shape<0>(layout_DP))), + shape_N(select_mode_shape(shape<0>(layout_BP), shape<1>(layout_CP), shape<1>(layout_DP))), + shape_K(select_mode_shape(shape<1>(layout_AP), shape<1>(layout_BP))), + shape_L(select_mode_shape(shape<2>(layout_AP), shape<2>(layout_BP), shape<2>(layout_CP), shape<2>(layout_DP))), + problem_shape_permute(shape_M, shape_N, shape_K, shape_L), + shape_A_permute(make_shape(shape_M, shape_K, shape_L)), + shape_B_permute(make_shape(shape_N, shape_K, shape_L)), + shape_C_permute(make_shape(shape_M, shape_N, shape_L)), + shape_D_permute(make_shape(shape_M, shape_N, shape_L)), + stride_A_permute(conditional_return(layout_AP.stride(), compute_default_stride(shape_A_permute, stride_A))), + stride_B_permute(conditional_return(layout_BP.stride(), compute_default_stride(shape_B_permute, stride_B))), + stride_C_permute(conditional_return(layout_CP.stride(), compute_default_stride(shape_C_permute, stride_C))), + stride_D_permute(conditional_return(layout_DP.stride(), compute_default_stride(shape_D_permute, stride_D))), + tensor_a(problem_size.m() * problem_size.k() * problem_size.batch()), + tensor_b(problem_size.k() * problem_size.n() * problem_size.batch()), + tensor_c(problem_size.m() * problem_size.n() * problem_size.batch()), + tensor_d(problem_size.m() * problem_size.n() * problem_size.batch()), + tensor_a_permuted(problem_size.m() * problem_size.k() * problem_size.batch()), + tensor_b_permuted(problem_size.k() * problem_size.n() * problem_size.batch()), + tensor_c_permuted(problem_size.m() * problem_size.n() * problem_size.batch()), + tensor_d_unpermuted(problem_size.m() * problem_size.n() * problem_size.batch()), + tensor_d_reference(problem_size.m() * problem_size.n() * problem_size.batch()), + gemm_mode(problem_size.batch() > 1 ? 
cutlass::gemm::GemmUniversalMode::kBatched : cutlass::gemm::GemmUniversalMode::kGemm), + arguments_permute{ + gemm_mode, + problem_shape_permute, + { + tensor_a.get(), stride_A_permute, + tensor_b.get(), stride_B_permute, + }, + { + { alpha, beta }, + tensor_c.get(), stride_C_permute, + tensor_d.get(), stride_D_permute + }, + hw_info + }, + workspace_permute(GemmPermute::get_workspace_size(arguments_permute)), + arguments_reference{ + gemm_mode, + problem_shape, + { + DoPermuteA ? tensor_a_permuted.get() : tensor_a.get(), stride_A, + DoPermuteB ? tensor_b_permuted.get() : tensor_b.get(), stride_B + }, + { + { alpha, beta }, + DoPermuteC ? tensor_c_permuted.get() : tensor_c.get(), stride_C, + DoPermuteD ? tensor_d_unpermuted.get() : tensor_d_reference.get(), stride_D + }, + hw_info + }, + workspace_reference(GemmReference::get_workspace_size(arguments_reference)) + { + if (options.verbose) { + print("Original GEMM problem:\n"); + print(" Problem shape: "); print(problem_shape); print("\n"); + print(" Layout A: "); print(make_layout(shape_A, stride_A)); print("\n"); + print(" Layout B: "); print(make_layout(shape_B, stride_B)); print("\n"); + print(" Layout C: "); print(make_layout(shape_C, stride_C)); print("\n"); + print(" Layout D: "); print(make_layout(shape_D, stride_D)); print("\n"); + print(" Tile shape: "); print(TileShape{}); print("\n"); + print("With fused permutations:\n"); + print(" Problem shape: "); print(problem_shape_permute); print("\n"); + print(" Layout A: "); print(make_layout(shape_A_permute, stride_A_permute)); print("\n"); + print(" Layout B: "); print(make_layout(shape_B_permute, stride_B_permute)); print("\n"); + print(" Layout C: "); print(make_layout(shape_C_permute, stride_C_permute)); print("\n"); + print(" Layout D: "); print(make_layout(shape_D_permute, stride_D_permute)); print("\n"); + print(" Tile shape: "); print(TileShapePermute{}); print("\n"); + } + + cutlass::reference::device::BlockFillRandomUniform(tensor_a.get(), tensor_a.size(), 1, ElementA(7), ElementA(-8), 0); + cutlass::reference::device::BlockFillRandomUniform(tensor_b.get(), tensor_b.size(), 2, ElementB(7), ElementB(-8), 0); + cutlass::reference::device::BlockFillRandomUniform(tensor_c.get(), tensor_c.size(), 3, ElementC(7), ElementC(-8), 0); + cutlass::reference::device::BlockFillSequential(tensor_d.get(), tensor_d.size(), ElementD(0), ElementD(0)); + + auto const gemm_init = [](auto & gemm, auto const & arguments, auto & workspace) { + cutlass::Status status = gemm.can_implement(arguments); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Requested GEMM kernel cannot be used for this problem.\n" + << "Check problem sizes and alignment requirements." 
<< std::endl; + exit(EXIT_FAILURE); + } + status = gemm.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + }; + + gemm_init(gemm_permute, arguments_permute, workspace_permute ); + gemm_init(gemm_reference, arguments_reference, workspace_reference); + } + + void debug_output(std::ostream & os) + { + auto print_tensor = [](std::ostream &os, char const * name, auto const & data, auto shape, auto stride) + { + std::vector> h_data(data.size()); + data.copy_to_host(h_data.data()); + Tensor t = make_tensor(h_data.data(), shape, stride); + os << "\n" << name << ": " << std::setw(4) << t << std::endl; + }; + auto [M,N,K,L] = problem_shape; + print_tensor(os, "A", tensor_a, make_shape(M,K,L), stride_A); + print_tensor(os, "B", tensor_b, make_shape(N,K,L), stride_B); + print_tensor(os, "C", tensor_c, make_shape(M,N,L), stride_C); + print_tensor(os, "D", tensor_d, make_shape(M,N,L), stride_D); + print_tensor(os, "D reference", tensor_d_reference, make_shape(M,N,L), stride_D); + } + + template + static float + run_gemm(Gemm &gemm) + { + GpuTimer timer; + if constexpr (DoTime) timer.start(); + cutlass::Status status = gemm.run(); + CUTLASS_CHECK(status); + if constexpr (DoTime) timer.stop(); + if constexpr (DoTime) return timer.elapsed_millis(); + else return 0; + } + + template + static float + run_permute(cutlass::device_memory::allocation const & input, + cutlass::device_memory::allocation & output, + Layout const& layout, + cutlass::KernelHardwareInfo const & hw_info) + { + auto idx = find_if(layout.stride(), [](auto x){ return not is_constant<1, decltype(x)>{}; }); + auto stride = get(layout.stride()); + + GpuTimer timer; + if constexpr (DoTime) timer.start(); + permute::kBatched, Permute>(input.get(), + output.get(), + size(take<0,2>(layout)), + static_cast(stride), + shape<2>(layout), + hw_info); + if constexpr (DoTime) timer.stop(); + if constexpr (DoTime) return timer.elapsed_millis(); + else return 0; + }; + + template + auto run_reference(Gemm2 &gemm) + { + float permute_time = 0.f; + if constexpr (DoPermuteA) { + auto orig_layout = make_original_layout(make_layout(shape_A, stride_A)); + permute_time += run_permute(tensor_a, tensor_a_permuted, orig_layout, hw_info); + } + if constexpr (DoPermuteB) { + auto orig_layout = make_original_layout(make_layout(shape_B, stride_B)); + permute_time += run_permute(tensor_b, tensor_b_permuted, select<1,0,2>(orig_layout), hw_info); + } + if constexpr (DoPermuteC) { + auto orig_layout = make_original_layout(make_layout(shape_C, stride_C)); + permute_time += run_permute(tensor_c, tensor_c_permuted, orig_layout, hw_info); + } + + float gemm_time = run_gemm(gemm); + + if constexpr (DoPermuteD) { + auto orig_layout = make_layout(shape_D, stride_D); + permute_time += run_permute(tensor_d_unpermuted, tensor_d_reference, orig_layout, hw_info); + } + + return cute::make_tuple(gemm_time, permute_time); + } + + bool verify() + { + run_gemm(gemm_permute); + run_reference(gemm_reference); + return cutlass::reference::device::BlockCompareEqual(tensor_d.get(), tensor_d_reference.get(), tensor_d.size()); + } + + bool run(Options const &options) + { + if (options.reference_check) { + if (!verify()) { + std::cout << "Failed validation" << std::endl; +#if 1 + debug_output(std::cout); +#endif + return false; + } + else { + std::cout << "Passed validation" << std::endl; + } + } + + // + // Run profiling loop + // + + auto const benchmark = [&](auto name, auto func) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { 
+ func(); + } + timer.stop(); + + double runtime = timer.elapsed_millis() / double(options.iterations); + double gflops = 2 * double(problem_size.product()) / 1e6 / runtime; // Two flops per multiply-add + + std::cout << name << ":\n"; + std::cout << " Runtime: " << runtime << " ms\n"; + std::cout << " GFLOPs: " << gflops << "\n"; + }; + + benchmark("Fused GEMM+permute", [&](){ run_gemm(gemm_permute); }); + benchmark("Unfused GEMM+permute", [&](){ run_reference(gemm_reference); }); + benchmark("Standalone GEMM only", [&](){ run_gemm(gemm_reference); }); + std::cout << "\n"; + + return true; + } +}; +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +} // namespace example + + +int main(int argc, char const **argv) +{ + bool notSupported = false; + + // CUDA 12 minimum required + if (__CUDACC_VER_MAJOR__ < 12) { + std::cerr << "This example requires CUDA Toolkit version 12 or later.\n"; + notSupported = true; + } + + cudaDeviceProp props; + CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); + + if (props.major < 9) { + std::cerr << "This example requires a device with compute capability 90 or higher.\n"; + notSupported = true; + } + if (notSupported) { + return EXIT_SUCCESS; // Do not fail CI checks on unsupported systems + } +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + example::Options options; + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << "\n"; + return EXIT_SUCCESS; + } + + if (!options.valid()) { + std::cerr << "Invalid arguments." << "\n"; + return EXIT_FAILURE; + } + + cutlass::KernelHardwareInfo hw_info; + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + using namespace cute; + + // Define the data types + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementD = cutlass::half_t; + using ElementAccumulator = float; + using ElementEpilogue = float; + + // M=64 for TMA epilogue + using TileShape = Shape<_128,_128,_64>; + + // Cluster launch with TMA multicast for better perf + using ClusterShape = Shape<_2,_2,_1>; + + bool result = true; + +#define COMPILE_ALL_EXAMPLES 0 + + // REGULAR GEMMS + + { + print("===================================================\n"); + print("Tensor A: RowMajor, Tensor4DPermute0213<8,16>\n"); + using Runner = example::ExampleRunner, + ElementB, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementC, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementD, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } +#if COMPILE_ALL_EXAMPLES + { + print("===================================================\n"); + print("Tensor A: ColumnMajor, Tensor4DPermute0213<8,16>\n"); + using Runner = example::ExampleRunner, + ElementB, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementC, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementD, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } + { + print("===================================================\n"); + print("Tensor B: RowMajor, Tensor4DPermute0213<8,16>\n"); + using Runner = example::ExampleRunner, + ElementC, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementD, 
cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } +#endif + { + print("===================================================\n"); + print("Tensor B: ColumnMajor, Tensor4DPermute0213<8,16>\n"); + using Runner = example::ExampleRunner, + ElementC, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementD, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } + { + print("===================================================\n"); + print("Tensor D: RowMajor, Tensor4DPermute0213<8,16>\n"); + using Runner = example::ExampleRunner, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } +#if COMPILE_ALL_EXAMPLES + { + print("===================================================\n"); + print("Tensor D: ColumnMajor, Tensor4DPermute0213<8,16>\n"); + using Runner = example::ExampleRunner, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } +#endif + { + print("===================================================\n"); + print("Tensor A: RowMajor, Tensor5DPermute20314<16,8,4>\n"); + using Runner = example::ExampleRunner, + ElementB, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementC, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementD, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } +#if COMPILE_ALL_EXAMPLES + { + print("===================================================\n"); + print("Tensor A: ColumnMajor, Tensor5DPermute02413<16,8,4>\n"); + using Runner = example::ExampleRunner, + ElementB, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementC, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementD, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } +#endif + { + print("===================================================\n"); + print("Tensor D: RowMajor, Tensor5DPermute20314<16,8,4>\n"); + using Runner = example::ExampleRunner, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } +#if COMPILE_ALL_EXAMPLES + { + print("===================================================\n"); + print("Tensor D: ColumnMajor, Tensor5DPermute02413<16,8,4>\n"); + using Runner = example::ExampleRunner, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } +#endif + + // BATCHED GEMMS + + { + print("===================================================\n"); + print("Tensor A: RowMajor, Tensor4DPermuteBMM0213<8>\n"); + using Runner = example::ExampleRunner, + ElementB, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementC, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementD, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= 
runner.run(options); + } + { + print("===================================================\n"); + print("Tensor D: RowMajor, Tensor4DPermuteBMM0213<8>\n"); + using Runner = example::ExampleRunner, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } +#if COMPILE_ALL_EXAMPLES + { + print("===================================================\n"); + print("Tensor A: ColumnMajor, Tensor4DPermuteBMM0321<8>\n"); + using Runner = example::ExampleRunner, + ElementB, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementC, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementD, cutlass::layout::RowMajor, cutlass::layout::NoPermute, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } + { + print("===================================================\n"); + print("Tensor D: RowMajor, Tensor4DPermuteBMM0321<8>\n"); + using Runner = example::ExampleRunner, + ElementAccumulator, ElementEpilogue, + TileShape, ClusterShape>; + Runner runner(options, hw_info); + result &= runner.run(options); + } +#endif + return result ? EXIT_SUCCESS : EXIT_FAILURE; +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) +} diff --git a/examples/53_hopper_gemm_permute/CMakeLists.txt b/examples/53_hopper_gemm_permute/CMakeLists.txt new file mode 100644 index 0000000000..dc70d95f6a --- /dev/null +++ b/examples/53_hopper_gemm_permute/CMakeLists.txt @@ -0,0 +1,36 @@ + +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + + +cutlass_example_add_executable( + 53_hopper_gemm_permute + 53_hopper_gemm_permute.cu + ) + diff --git a/examples/53_hopper_gemm_permute/permute_kernel.cuh b/examples/53_hopper_gemm_permute/permute_kernel.cuh new file mode 100644 index 0000000000..8abe70108d --- /dev/null +++ b/examples/53_hopper_gemm_permute/permute_kernel.cuh @@ -0,0 +1,92 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Simple permutation kernel implementation. +*/ + +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/tensor_view.h" +#include "cutlass/fast_math.h" +#include "cute/numeric/numeric_types.hpp" + +namespace example +{ + +/** + * Assumes column-major input (M mode is contiguous, N mode is strided). + * For row major, the inputs must be switched accordingly. +*/ +template +__global__ void +permute_kernel(Element const* __restrict__ input, + Element* __restrict__ output, + Permute permute, + int64_t num_elems, + cutlass::FastDivmod stride_divmod) +{ + // CUTLASS 2.x batched permute functions assume 0 batch stride for target tensor + Element const * input_b = input + blockIdx.z * num_elems; + Element * output_b = output + (Batched ? 
0 : blockIdx.z * num_elems); + for (int64_t k = threadIdx.x + blockIdx.x * blockDim.x; k < num_elems; k += blockDim.x * gridDim.x) + { + int i, j; + stride_divmod(j, i, k); + output_b[permute(cutlass::PitchLinearCoord(i, j))] = input_b[i + j * stride_divmod.divisor]; + } +} + +template +void permute(Element const* input, + Element * output, + int64_t num_elems, + int stride, + int batch_count, + cutlass::KernelHardwareInfo const& hw_info) +{ + // Upcast to uint128_t data type + int factor = 128 / cutlass::sizeof_bits::value; + assert(stride % factor == 0); + int stride_upcast = stride/factor; + int64_t num_elems_upcast = num_elems / factor; + Permute permute_upcast(cutlass::PitchLinearCoord(stride_upcast, int(num_elems_upcast/stride_upcast)), stride_upcast); + + cutlass::FastDivmod stride_divmod(stride); + dim3 blocks(hw_info.sm_count, 1, batch_count); + permute_kernel<<>>(reinterpret_cast(input), + reinterpret_cast(output), + permute_upcast, + num_elems_upcast, + stride_upcast); +} + +} // namespace example diff --git a/examples/53_hopper_gemm_permute/permute_traits.hpp b/examples/53_hopper_gemm_permute/permute_traits.hpp new file mode 100644 index 0000000000..4c5baccac5 --- /dev/null +++ b/examples/53_hopper_gemm_permute/permute_traits.hpp @@ -0,0 +1,274 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Additional permutation information for the example. +*/ + +#include "cutlass/layout/permute.h" +#include "cutlass/gemm/gemm.h" + +namespace example +{ + +using namespace cute; + +// This struct is specialized below for different CUTLASS 2.x permutation ops +// to describe the operation in terms of target CuTe shape and stride order. 
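+
+// As a concrete (purely illustrative) reference: a "0213" permutation exchanges the two middle
+// modes of a rank-4 view, i.e. an element at coordinate (i0,i1,i2,i3) of a row-major
+// [s0,s1,s2,s3] tensor lands at coordinate (i0,i2,i1,i3) of a row-major [s0,s2,s1,s3] tensor.
+// The helper below is a hypothetical host-side sketch for intuition only; it is not used by the
+// traits in this file.
+inline void permute_0213_reference(float const* in, float* out, int s0, int s1, int s2, int s3) {
+  for (int i0 = 0; i0 < s0; ++i0)
+  for (int i1 = 0; i1 < s1; ++i1)
+  for (int i2 = 0; i2 < s2; ++i2)
+  for (int i3 = 0; i3 < s3; ++i3) {
+    int64_t src = ((int64_t(i0) * s1 + i1) * s2 + i2) * s3 + i3; // row-major [s0,s1,s2,s3]
+    int64_t dst = ((int64_t(i0) * s2 + i2) * s1 + i1) * s3 + i3; // row-major [s0,s2,s1,s3]
+    out[dst] = in[src];
+  }
+}
+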
+template +struct PermuteTraits {}; + +// Use X as a placeholder for shape division result +using X = Underscore; + +// Reshape a rank-2 shape into a multidimensional shape. +// Input: +// shape = (A, B, ...) +// target_shape = ((A1, ..., X, ..., Am), (B1, ..., X, ..., Bn), ...) +// Output: +// ((A1, ..., A/prod(A1..Am), ..., Am), (B1, ..., B/prod(B1..Bn), ..., Bn), ...) +template +constexpr auto +reshape(Shape const& shape, TargetShape const& target_shape) +{ + if constexpr (is_tuple::value) { + return cute::transform(shape, target_shape, [](auto && s, auto && t){ return reshape(s, t); }); + } + else { + auto idx = find_if(target_shape, [](auto x){ return is_underscore{}; }); + constexpr int I = decltype(idx)::value; + static_assert(I < tuple_size_v, "Each mode of TargetShape must contain a placeholder X"); + auto divisors = remove(target_shape); + assert(shape % product(divisors) == 0); + return replace(target_shape, shape / product(divisors)); + } +} + +// Given a tensor layout, compute a permutation layout consisting of: +// - sub-modes corresponding to the implied multidimensional shape of the source tensor +// - strides accounting for the permutation operation being performed +template +constexpr auto +make_permute_layout(Layout const& layout) { + static_assert(cute::rank(Shape{}) == 3, "Only rank-3 layouts are supported"); + if constexpr (Transpose) { + // Deal with tensor B by transposing appropriately before and after computing the permute layout. + // Its CuTe-canonical mode order is [N,K,L], while permute operations expect [row,col,batch]. + return select<1,0,2>(make_permute_layout(select<1,0,2>(layout))); + } + else { + if constexpr (cutlass::layout::is_trivial_permute) { + // Special case for NoPermute. Use a depth-2 layout for consistency with other permutations. + using ShapeProfile = tuple, tuple, tuple>; + return unflatten(layout, ShapeProfile{}); + } + else { + // Here's where the permutation layout is actually built + using ShapeProfile = typename PermuteTraits::ShapeProfile; + using StrideOrder = typename PermuteTraits::StrideOrder; + return make_ordered_layout(reshape(layout.shape(), ShapeProfile{}), StrideOrder{}); + } + } +} + +namespace detail +{ + +template +struct is_constant_pred { + template + constexpr auto operator()(T) { + return is_constant{}; + } +}; + +template +constexpr auto +inverse_impl(Permutation const & perm, seq) { + return cute::make_tuple(Int{})>{}...); +} + +} // namespace detail + +// Compute an inverse of a permutation represented as a tuple of cute::Int<> +template +constexpr auto +inverse(Permutation const & perm) { + auto flat_perm = flatten(perm); + return unflatten(detail::inverse_impl(flat_perm, tuple_seq{}), perm); +} + +template +using inverse_t = decltype(inverse(T{})); + +// Given a rank-2 layout of tensor that is assumed to have been permuted, +// compute the original rank-2 layout of the tensor prior to the permutation. +// This is needed to form the correct input to the standalone permutation kernel. +template +constexpr auto +make_original_layout(Layout const& layout) { + static_assert(cute::rank(Shape{}) == 3, "Only rank-3 layouts are supported"); + if constexpr (Transpose) { + // Deal with tensor B by transposing appropriately before and after computing the permute layout. + // Its CuTe-canonical mode order is [N,K,L], while permute operations expect [row,col,batch]. 
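+    // select<1,0,2> reorders the modes (N,K,L) -> (K,N,L) to match the [row,col,batch]
+    // order expected by the permute traits; applying select<1,0,2> again to the result
+    // restores the original (N,K,L) mode order.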
+ return select<1,0,2>(make_original_layout(select<1,0,2>(layout))); + } + else { + using ShapeProfile = typename PermuteTraits::ShapeProfile; + auto re_shape = flatten(reshape(layout.shape(), ShapeProfile{})); + using IndexOrder = typename PermuteTraits::IndexOrder; + auto orig_shape = transform_leaf(IndexOrder{}, [&](auto i){ return get(re_shape); }); + using OrigOrder = conditional_t(), seq<0,1,2>, seq<1,0,2>>; + // print("Permuted shape: "); print(reshape(layout.shape(), ShapeProfile{})); print("\n"); + // print("Original shape: "); print(orig_shape); print("\n"); + return make_ordered_layout(product_each(orig_shape), OrigOrder{}); + } +} + +/////////////// Tensor4DPermute0213 //////////////////// + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = false; + using ShapeProfile = Shape>, Shape,X>, Shape>; + using IndexOrder = Step, Step<_1,_3>, Step<_4>>; + using StrideOrder = inverse_t; // Step, Step<_1,_3>, Step<_4>>; +}; + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = false; + using ShapeProfile = Shape>, Shape,X>, Shape>; + using IndexOrder = Step, Step<_1,_3>, Step<_4>>; + using StrideOrder = inverse_t; // Step, Step<_1,_3>, Step<_4>>; +}; + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = false; + using ShapeProfile = Shape,X>, Shape>, Shape>; + using IndexOrder = Step, Step<_0,_2>, Step<_4>>; + using StrideOrder = Step, Step<_0,_2>, Step<_4>>; +}; + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = false; + using ShapeProfile = Shape,X>, Shape>, Shape>; + using IndexOrder = Step, Step<_0,_2>, Step<_4>>; + using StrideOrder = Step, Step<_0,_2>, Step<_4>>; +}; + +/////////////// Tensor4DPermuteBMM0321 //////////////////// + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = true; + using ShapeProfile = Shape, Shape, Shape,X>>; + using IndexOrder = Step, Step<_1>, Step<_3>>; + using StrideOrder = Step, Step<_2>, Step<_1,_3>>; +}; + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = true; + using ShapeProfile = Shape>, Shape, Shape>; + using IndexOrder = Step, Step<_2>, Step<_1,_3>>; + using StrideOrder = Step, Step<_1>, Step<_3>>; +}; + +/////////////// Tensor4DPermuteBMM0213 //////////////////// + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = true; + using ShapeProfile = Shape, Shape, Shape,X>>; + using IndexOrder = Step, Step<_1,_2>, Step<_3>>; + using StrideOrder = Step, Step<_0>, Step<_1,_3>>; +}; + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = true; + using ShapeProfile = Shape, Shape>, Shape>; + using IndexOrder = Step, Step<_1>, Step<_2,_3>>; + using StrideOrder = Step, Step<_0,_2>, Step<_3>>; +}; + +/////////////// Tensor5DPermute02413 //////////////////// + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = false; + using ShapeProfile = Shape>, Shape,Int,X>, Shape>; + using IndexOrder = Step, Step<_4,_1,_3>, Step<_5>>; + using StrideOrder = inverse_t; // Step, Step<_1,_4,_2>, Step<_5>>; +}; + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = false; + using ShapeProfile = Shape>, Shape,Int>, Shape>; + using IndexOrder = Step, Step<_1,_4,_2>, Step<_5>>; + using StrideOrder = inverse_t; // Step, Step<_4,_1,_3>, Step<_5>>; +}; + +/////////////// Tensor5DPermute20314 //////////////////// + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = false; + using ShapeProfile = Shape,X>, Shape,Int>, Shape>; + using IndexOrder = Step, Step<_3,_1,_4>, 
Step<_5>>; + using StrideOrder = Step, Step<_0,_2,_4>, Step<_5>>; +}; + +template +struct PermuteTraits> +{ + static constexpr bool kBatched = false; + using ShapeProfile = Shape>, Shape,Int>, Shape>; + using IndexOrder = Step, Step<_2,_4,_1>, Step<_5>>; + using StrideOrder = Step, Step<_0,_3,_1>, Step<_5>>; +}; + +} // namespace example diff --git a/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu b/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu new file mode 100644 index 0000000000..726f6d222a --- /dev/null +++ b/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu @@ -0,0 +1,599 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Simple Hopper FP8 GEMM example using CUTLASS 3.0 APIs for NVIDIA Hopper architecture + + This example demonstrate a simple way to instantiate and run a FP8 GEMM using the new CUTLASS 3.0 + APIs on NVIDIA Hopper architecture. New features that will be showcased in this example are as follows: + + 1. NVIDIA Hopper architecture introduces a new series of tensor core instructions (GMMA) + which are more efficient than the Ampere tensor core instructions. + + 2. NVIDIA Hopper architecture includes new Tensor Memory Accelerator (TMA) unit to transfer large + blocks of data efficiently between global memory and shared memory. TMA also supports asynchronous + copies between thread blocks in a cluster. + + 3. This example uses the Warp Specialized kernel design (see /media/docs/efficient_gemm.md for details). + + 4. This example shows all important fusions used by FP8 gemm kernels, + i.e., scale factor for A, B, C, D tensor, the abs_max value of D tensor. + + 5. 
A simple way to tune the CTA rasterization direction and swizzle pattern of Hopper kernels. Both the + CTA rasterization direction and swizzle pattern impact cross-CTA locality of accesses. By tuning we can + improve performance. + + Examples: + + $ ./examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm --m=2048 --n=2048 --k=2048 --rasterization=N --swizzle=2 +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler_params.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gett.hpp" + + +#include "helper.h" +#include "hopper_fp8_commandline.hpp" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// + +// A matrix configuration +using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = cutlass::float_e5m2_t; // Element type for B matrix operand +using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// C matrix configuration +using ElementC = cutlass::float_e4m3_t; // Element type for C and D matrix operands +using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes) + +// D matrix configuration +using ElementD = ElementC; +using LayoutD = LayoutC; +constexpr int AlignmentD = AlignmentC; + +// Auxiliary matrix configuration and other fusion types +using ElementAux = ElementC; +using LayoutAux = LayoutC; +using ElementAmax = float; +using ElementBias = float; + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ElementCompute = float; // Element type for epilogue computation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using TileShape = Shape<_64,_128,_128>; // Threadblock-level tile size +using ClusterShape = 
Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecialized; +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized; +using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; +using FusionOperation = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltActAmaxAux< + LayoutAux, cutlass::epilogue::thread::ReLU, ElementD, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementC>; + +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, + TileShape, ClusterShape, + EpilogueTileType, + ElementAccumulator, ElementCompute, + ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, + EpilogueSchedule, + FusionOperation + >::CollectiveOp; + +using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementA, LayoutA, AlignmentA, + ElementB, LayoutB, AlignmentB, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloop, + CollectiveEpilogue +>; + +using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + +// Extract information from Gemm kernel. +using EpilogueOutputOp = typename Gemm::EpilogueOutputOp; +using ElementScalar = typename EpilogueOutputOp::ElementScalar; +using ElementAmax = typename EpilogueOutputOp::ElementAmax; +using ActivationFunctor = typename EpilogueOutputOp::ActivationFn; + +using StrideA = typename Gemm::GemmKernel::StrideA; +using StrideB = typename Gemm::GemmKernel::StrideB; +using StrideC = typename Gemm::GemmKernel::StrideC; +using StrideD = typename Gemm::GemmKernel::StrideD; +using StrideAux = StrideD; + +constexpr bool IsDFp8 = + cute::is_same_v or + cute::is_same_v; + +constexpr bool IsAuxFp8 = + cute::is_same_v or + cute::is_same_v; + +/// Initialization +StrideA stride_A; +StrideB stride_B; +StrideC stride_C; +StrideD stride_D; +StrideAux stride_aux; +uint64_t seed; + +cutlass::HostTensor tensor_A; +cutlass::HostTensor tensor_B; +cutlass::HostTensor tensor_C; +cutlass::HostTensor tensor_D; +cutlass::HostTensor tensor_ref_D; +cutlass::HostTensor tensor_aux; +cutlass::HostTensor tensor_ref_aux; + +using LayoutScalar = cutlass::layout::PackedVectorLayout; +cutlass::HostTensor scalar_alpha; +cutlass::HostTensor scalar_beta; +cutlass::HostTensor scale_A; +cutlass::HostTensor scale_B; +cutlass::HostTensor scale_C; +cutlass::HostTensor scale_D; +cutlass::HostTensor scale_aux; +cutlass::HostTensor abs_max_D; +cutlass::HostTensor reference_abs_max_D; +cutlass::HostTensor abs_max_aux; +cutlass::HostTensor reference_abs_max_aux; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90Params::RasterOrderOptions; + +/// Result structure +struct Result +{ + double avg_runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + Result( + double avg_runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = 
cudaSuccess) + : + avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false) + {} + +}; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_tensor( + cutlass::TensorView view, + uint64_t seed) { + + double scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } + else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } + else if (bits_output == 16) { + scope_max = 5; + scope_min = -5; + } + else { + scope_max = 8; + scope_min = -8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope_max, scope_min, 0); + + return true; +} + +/// Initialize operands to be used in the GEMM and reference GEMM +void initialize(const Options &options) { + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, options.l)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l)); + stride_aux = stride_D; + + auto a_coord = cutlass::make_Coord(options.m * options.l, options.k); + auto c_coord = cutlass::make_Coord(options.m * options.l, options.n); + auto b_coord = cutlass::make_Coord(options.k, options.n * options.l); + + tensor_A.resize(a_coord); + tensor_B.resize(b_coord); + tensor_C.resize(c_coord); + tensor_D.resize(c_coord); + tensor_ref_D.resize(c_coord); + + initialize_tensor(tensor_A.host_view(), seed + 2022); + initialize_tensor(tensor_B.host_view(), seed + 2023); + initialize_tensor(tensor_C.host_view(), seed + 2024); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_C.sync_device(); + tensor_D.sync_device(); + + if (options.save_aux) { + tensor_aux.resize(c_coord); + tensor_aux.sync_device(); + tensor_ref_aux.resize(c_coord); + } + + if (options.device_scale) { + scalar_alpha.resize(cutlass::make_Coord(1)); + scalar_beta.resize(cutlass::make_Coord(1)); + scale_A.resize(cutlass::make_Coord(1)); + scale_B.resize(cutlass::make_Coord(1)); + scale_C.resize(cutlass::make_Coord(1)); + scale_D.resize(cutlass::make_Coord(1)); + scale_aux.resize(cutlass::make_Coord(1)); + + cutlass::reference::host::TensorFill(scalar_alpha.host_view(), options.alpha); + cutlass::reference::host::TensorFill(scalar_beta.host_view(), options.beta); + cutlass::reference::host::TensorFill(scale_A.host_view(), options.scale_a); + cutlass::reference::host::TensorFill(scale_B.host_view(), options.scale_b); + cutlass::reference::host::TensorFill(scale_C.host_view(), options.scale_c); + cutlass::reference::host::TensorFill(scale_D.host_view(), options.scale_d); + cutlass::reference::host::TensorFill(scale_aux.host_view(), options.scale_aux); + + scalar_alpha.sync_device(); + scalar_beta.sync_device(); + scale_A.sync_device(); + scale_B.sync_device(); + scale_C.sync_device(); + scale_D.sync_device(); + scale_aux.sync_device(); + } + + if (IsDFp8 && options.save_amax) { + abs_max_D.resize(cutlass::make_Coord(1)); + 
abs_max_D.sync_device(); + reference_abs_max_D.resize(cutlass::make_Coord(1)); + } + + if (IsAuxFp8 && options.save_aux && options.save_amax) { + abs_max_aux.resize(cutlass::make_Coord(1)); + abs_max_aux.sync_device(); + reference_abs_max_aux.resize(cutlass::make_Coord(1)); + } +} + +/// Populates a Gemm::Arguments structure from the given commandline options +typename Gemm::Arguments args_from_options(const Options &options) +{ + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + {options.m, options.n, options.k, options.l}, + {tensor_A.device_data(), stride_A, tensor_B.device_data(), stride_B}, + { + {}, // epilogue.thread + tensor_C.device_data(), stride_C, + tensor_D.device_data(), stride_D + } + }; + + auto &fusion_args = arguments.epilogue.thread; + fusion_args.alpha = options.alpha; + fusion_args.beta = options.beta; + fusion_args.alpha_ptr = scalar_alpha.device_data(); + fusion_args.beta_ptr = scalar_beta.device_data(); + fusion_args.scale_a = options.scale_a; + fusion_args.scale_b = options.scale_b; + fusion_args.scale_c = options.scale_c; + fusion_args.scale_a_ptr = scale_A.device_data(); + fusion_args.scale_b_ptr = scale_B.device_data(); + fusion_args.scale_c_ptr = scale_C.device_data(); + + // ignored if tensor types are not fp8 + fusion_args.scale_d = options.scale_d; + fusion_args.scale_aux = options.scale_aux; + fusion_args.scale_d_ptr = scale_D.device_data(); + fusion_args.scale_aux_ptr = scale_aux.device_data(); + + // leaving/setting these as nullptr disables the fusion at runtime + fusion_args.bias_ptr = nullptr; + + if (options.save_aux) { + fusion_args.aux_ptr = tensor_aux.device_data(); + fusion_args.dAux = stride_aux; + if (options.save_amax) { + fusion_args.amax_aux_ptr = abs_max_aux.device_data(); + } + } + + if (options.save_amax) { + fusion_args.amax_D_ptr = abs_max_D.device_data(); + } + + arguments.scheduler.raster_order = options.raster; + // The tile scheduler will swizzle up to 8 and with the nearest multiple of 2 (i.e., 1, 2, 4, and 8) + arguments.scheduler.max_swizzle_size = options.swizzle; + + return arguments; +} + +bool verify(const Options &options) { + // + // Compute reference output + // + + // Create instantiation for device reference gemm kernel + auto A = cute::make_tensor(tensor_A.host_data(), + cute::make_layout(cute::make_shape(options.m, options.k, options.l), stride_A)); + auto B = cute::make_tensor(tensor_B.host_data(), + cute::make_layout(cute::make_shape(options.n, options.k, options.l), stride_B)); + auto C = cute::make_tensor(tensor_C.host_data(), + cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_C)); + auto D = cute::make_tensor(tensor_ref_D.host_data(), + cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_D)); + auto Aux = cute::make_tensor(tensor_ref_aux.host_data(), + cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_aux)); + using unused_t = decltype(D); + + cutlass::reference::host::GettMainloopParams mainloop_params{A, B}; + + cutlass::reference::host::GettEpilogueParams< + ElementScalar, + ElementScalar, + ElementAccumulator, + ElementCompute, + decltype(C), + decltype(D), + unused_t, // bias + decltype(Aux), + unused_t, // valpha + unused_t, // vbeta + ActivationFunctor + > epilogue_params; + + epilogue_params.C = C; + epilogue_params.D = D; + epilogue_params.Aux = Aux; + epilogue_params.alpha = options.alpha; + epilogue_params.beta = options.beta; + epilogue_params.scale_a = options.scale_a; + epilogue_params.scale_b = 
options.scale_b; + epilogue_params.scale_c = options.scale_c; + epilogue_params.scale_d = options.scale_d; + epilogue_params.scale_aux = options.scale_aux; + epilogue_params.abs_max_D = reference_abs_max_D.host_data(); + epilogue_params.abs_max_Aux = reference_abs_max_aux.host_data(); + + // get reference result + cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params); + + // compare_reference + tensor_D.sync_host(); + bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view()); + + if (IsDFp8 && options.save_amax) { + abs_max_D.sync_host(); + passed &= abs_max_D.at(cutlass::make_Coord(0)) == reference_abs_max_D.at(cutlass::make_Coord(0)); + } + + if (options.save_aux) { + tensor_aux.sync_host(); + passed &= cutlass::reference::host::TensorEquals(tensor_ref_aux.host_view(), tensor_aux.host_view()); + if (IsAuxFp8 && options.save_amax) { + abs_max_aux.sync_host(); + passed &= abs_max_aux.at(cutlass::make_Coord(0)) == reference_abs_max_aux.at(cutlass::make_Coord(0)); + } + } + + return passed; +} + +/// Execute a given example GEMM computation +template +int run(Options &options) +{ + initialize(options); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm + auto arguments = args_from_options(options); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(gemm.run()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + Result result; + result.passed = verify(options); + + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl; + + if (!result.passed) { + exit(-1); + } + + // Run profiling loop + if (options.iterations > 0) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm.run()); + } + timer.stop(); + + // Compute average runtime and GFLOPs. 
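+    // Average over the profiled iterations; GFLOPS counts 2*M*N*K flops (two per
+    // multiply-add) divided by the average runtime in seconds (see Options::gflops).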
+ float elapsed_ms = timer.elapsed_millis(); + result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); + + std::string raster = "Heuristic"; + + if (options.raster == RasterOrderOptions::AlongN) { + raster = "Along N"; + } + else if (options.raster == RasterOrderOptions::AlongM) { + raster = "Along M"; + } + + std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + std::cout << " Rasterization: " << raster << " with a maximum CTA swizzle of " << options.swizzle << std::endl; + std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; + } + + return 0; +} + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.0 Toolkit to run this example + // and must have compute capability at least 90. + if (__CUDACC_VER_MAJOR__ < 12) { + std::cerr << "This example requires CUDA 12 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(¤t_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (props.major < 9) { + std::cerr + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater).\n"; + return 0; + } + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // + // Evaluate CUTLASS kernels + // + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + run(options); +#endif + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/54_hopper_fp8_warp_specialized_gemm/CMakeLists.txt b/examples/54_hopper_fp8_warp_specialized_gemm/CMakeLists.txt new file mode 100644 index 0000000000..209b2779fe --- /dev/null +++ b/examples/54_hopper_fp8_warp_specialized_gemm/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + 54_hopper_fp8_warp_specialized_gemm + 54_hopper_fp8_warp_specialized_gemm.cu + ) diff --git a/examples/54_hopper_fp8_warp_specialized_gemm/hopper_fp8_commandline.hpp b/examples/54_hopper_fp8_warp_specialized_gemm/hopper_fp8_commandline.hpp new file mode 100644 index 0000000000..96d8794d8e --- /dev/null +++ b/examples/54_hopper_fp8_warp_specialized_gemm/hopper_fp8_commandline.hpp @@ -0,0 +1,129 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +// Command line options parsing +template +struct Options { + + bool help = false; + + float alpha = 1.f, beta = 0.f; + float scale_a = 1.f, scale_b = 1.f, scale_c = 1.f, scale_d = 1.f, scale_aux = 1.f; + bool device_scale = false; + bool save_aux = true; + bool save_amax = true; + int iterations = 1000; + int m = 1024, n = 512, k = 1024, l = 1; + RasterOrderOptions raster; + int swizzle; + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m); + cmd.get_cmd_line_argument("n", n); + cmd.get_cmd_line_argument("k", k); + cmd.get_cmd_line_argument("l", l); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("scale_a", scale_a, 1.f); + cmd.get_cmd_line_argument("scale_b", scale_b, 1.f); + cmd.get_cmd_line_argument("scale_c", scale_c, 1.f); + cmd.get_cmd_line_argument("scale_d", scale_d, 1.f); + cmd.get_cmd_line_argument("scale_aux", scale_aux, 1.f); + cmd.get_cmd_line_argument("device_scale", device_scale, false); + cmd.get_cmd_line_argument("save_aux", save_aux, true); + cmd.get_cmd_line_argument("save_amax", save_amax, true); + cmd.get_cmd_line_argument("iterations", iterations); + + char raster_char; + cmd.get_cmd_line_argument("raster", raster_char); + + if (raster_char == 'N' || raster_char == 'n') { + raster = RasterOrderOptions::AlongN; + } + else if (raster_char == 'M' || raster_char == 'm') { + raster = RasterOrderOptions::AlongM; + } + else if (raster_char == 'H' || raster_char == 'h') { + raster = RasterOrderOptions::Heuristic; + } + + cmd.get_cmd_line_argument("swizzle", swizzle, 1); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "54_fp8_hopper_warp_specialized_gemm\n\n" + << " Hopper FP8 GEMM using a Warp Specialized kernel.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the l extent (batch) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n" + << " --scale_a= Scaling factor for A\n" + << " --scale_b= Scaling factor for B\n" + << " --scale_c= Scaling factor for C\n" + << " --scale_d= Scaling factor for D (ignored for non-fp8 D)\n" + << " --scale_aux= Scaling factor for the auxiliary tensor (ignored for non-fp8 aux)\n" + << " --device_scale= Copy scalars to device memory before kernel launch (default: false)\n" + << " --save_aux= Save the pre-activation as an auxiliary tensor (default: true)\n" + << " --save_amax= Save the pre-scaled max absolute value of any fp8 outputs (aux and/or D) (default: true)\n" + << " --raster= CTA Rasterization direction (N for along N, M for along M, and H for heuristic)\n\n" + << " --swizzle= CTA Rasterization swizzle\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "54_fp8_hopper_warp_specialized_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * m * n * k; + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; diff --git a/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_bf16_gemm.cu b/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_bf16_gemm.cu new file mode 100644 index 0000000000..ab82b40cca --- /dev/null +++ b/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_bf16_gemm.cu @@ -0,0 +1,683 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Hopper GEMM example with different data types using CUTLASS 3.0 APIs for NVIDIA Hopper architecture + + This example shows how to perform INT4 x BF16 GEMM and scale up the INT4 weight during dequantization. + + The narrower type always passes through the register file. Therefore, in cases where the narrower type is operand B, the collective will implicitly swap + A and B in the main loop. However, as a result of this collective performing implicit swaps, it does not support TMA epilogues. Consequently, it is essential to consider this when constructing the epilogue, + as illustrated in this example. + + Note that in this example, we explicitly swap A and B in order to use TMA epilogues. We do this since TMA epilogues are more performant on problem sizes of interest. + + As an additional optimization, we can reorder the narrow data type tensor such that elements read into register file by the same thread are contiguous in global and shared memory. + This promotes vectorization of shared memory loads and removes additional instructions on the critical path. For example, when MMA is performed in 16-bit data type, each thread reads + 4 groups of 2 elements that are logically contiguous in the same row (refer to https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-a for thread-value layout). + If the narrow type is INT4 and tensor is major in K dim, only 8 bits can be read at a time, leading to extra load instructions and suboptimal utilization of shared memory throughput. + If we reorder the data offline to place all 16 elements read by a thread contiguously in memory, a single 64-bit load is sufficient. This reordering is often feasible when the quantized + tensor is static (e.g. weight tensor of a NN layer at inference time). This example demonstrates how such a reordering can be performed and communicated to the kernel when the options.shuffle is set to true. + + Furthermore, the conversion from {INT4, UINT4} to {FP16, BF16} can benefit from pre-shuffling the weights in the order [0,2,4,6,1,3,5,7]. This allows multiple nibbles to be efficiently extracted and up-converted + in parallel. The reordering is enabled by defining the layout type `ValueShuffle`. Refer to the partial specializations of `NumericArrayShuffleConverter` in "include/cutlass/detail/collective/mixed_input_utils.hpp" + for more details. + + It is expected that the scale's K dimension be scale_k = ceil_div(problem_k, group_size). + + Scales are always expected to be MN major. This means the fastest changing dimension must be M if A is scaled or N if B is scaled. + + If A is being scaled, the scales must have shape [M, scale_k], while if B is scaled, it must have shape [N, scale_k]. + + The implementation only supports "group-wise" scales. However, we can make it work for per-column scales by setting the group's size + equal to the gemm problem K. 
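+
+    For example, with --k=8192 and --g=8192 (as in the second command below), scale_k =
+    ceil_div(8192, 8192) = 1, so the scales for B have shape [N, 1], i.e. one scale per
+    column of B.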
+ + Limitations: + 1) The INT4 weights have additional encoding requirements. + 2) The scales must be MN major. That means if A is scaled, it must be column major, but if B is scaled it must be row major. + 3) The scales must have the same layout and groupsize. + 4) The groupsize must be greater or equal to the tile shape k. + 5) Currently, TMA epilogues cannot be used when the narrow type is the B operand. This limitation arises because the implementation always swaps the + operands to ensure that the narrow type passes through the register file, and TMA epilogues do not currently support implicit swap + transpose operations. + We plan to address this limitation in the future. However, we address this in the example by explicitly swapping and transposing the operands. + + Optimizing suggestions: + 1) Use a small tile size, since the register pressure for this GEMM (and RS GEMM in general) is high (it uses a lot of register space). + + Examples: + + Runs the mixed input batched gemm (with batch size 2), converting B to the type of A (mode 0) + $ ./examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_bf16_gemm --m=2048 --n=2048 --k=2048 --l=2 --mode=0 + + Runs the mixed input gemm, and applies a scaling factor to B before mma (mode 1). Applies a vector of scales to the entire + matrix (group size is the same as the gemm k dimension). + $ ./examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_bf16_gemm --m=4096 --n=5120 --k=8192 --g=8192 --mode=1 +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "helper.h" +#include "mixed_dtype_utils.hpp" +#include "packed_scale.hpp" +#include "reorder_utils.hpp" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// +using MmaType = cutlass::bfloat16_t; +using QuantType = cutlass::int4b_t; +constexpr int TileShapeK = 128 * 8 / sizeof_bits::value; + +// A matrix configuration +using ElementA = MmaType; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = QuantType; // Element type for B matrix operand +using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// This example manually swaps and transposes, so 
keep transpose of input layouts +using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; +using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type; + +using StrideA = cutlass::detail::TagToStrideA_t; +using StrideB = cutlass::detail::TagToStrideB_t; + +// Define the CuTe layout for reoredered quantized tensor B +// LayoutAtomQuant places values that will be read by the same thread in contiguous locations in global memory. +// It specifies the reordering within a single warp's fragment +//using ValueShuffle = Layout<_1>; // no value reordering +using ValueShuffle = Layout, Stride<_4,_1>>; // order [0,2,4,6,1,3,5,7] +int constexpr NumShuffleAtoms = 1; +using MmaAtomShape = Layout>>; +using LayoutAtomQuant = decltype(compute_memory_reordering_atom()); +using LayoutB_Reordered = decltype(tile_to_shape(LayoutAtomQuant{}, Layout, StrideB>{})); + +using ElementScale = MmaType; +using ElementZero = ElementScale; +using LayoutScale = cutlass::layout::RowMajor; + +// C/D matrix configuration +using ElementC = cutlass::half_t; // Element type for C and D matrix operands +using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes) + +// D matrix configuration +using ElementD = ElementC; +using LayoutD = LayoutC; +constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ElementCompute = float; // Element type for epilogue computation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using TileShape = Shape<_128,_128,cute::Int>; // Threadblock-level tile size +using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedCooperative; // Kernel to launch based on the default setting in the Collective Builder +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; +using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + EpilogueTileType, + ElementAccumulator, ElementAccumulator, + // Transpose layout of D here since we use explicit swap + transpose + // the void type for C tells the builder to allocate 0 smem for the C matrix. + // We can enable this if beta == 0 by changing ElementC to void below. + ElementC, typename cutlass::layout::LayoutTranspose::type, AlignmentC, + ElementD, typename cutlass::layout::LayoutTranspose::type, AlignmentD, + EpilogueSchedule // This is the only epi supporting the required swap + transpose. + >::CollectiveOp; + +// ============================================================ MIXED INPUT NO SCALES ============================================================================ +// The collective will infer that the narrow type should be upcasted to the wide type. 
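+// Note (illustrative): since D = A x B implies D^T = B^T x A^T, swapping the operands lets the
+// kernel compute D^T instead, which is why the epilogue above is built with transposed C/D layouts.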
+// We swap A and B operands to the builder here +using CollectiveMainloopConvertOnly = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementB, LayoutB_Transpose, AlignmentB, + ElementA, LayoutA_Transpose, AlignmentA, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernelConvertOnly = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopConvertOnly, + CollectiveEpilogue +>; + +using GemmConvertOnly = cutlass::gemm::device::GemmUniversalAdapter; + +using CollectiveMainloopConvertOnlyShuffled = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementB, LayoutB_Reordered, AlignmentB, + ElementA, LayoutA_Transpose, AlignmentA, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernelConvertOnlyShuffled = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopConvertOnlyShuffled, + CollectiveEpilogue +>; + +using GemmConvertOnlyShuffled = cutlass::gemm::device::GemmUniversalAdapter; + +// =========================================================== MIXED INPUT WITH SCALES =========================================================================== +// The Scale information must get paired with the operand that will be scaled. In this example, B is scaled so we make a tuple of B's information and the scale information. +using CollectiveMainloopScaleOnly = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + cute::tuple, LayoutB_Transpose, AlignmentB, + ElementA, LayoutA_Transpose, AlignmentA, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernelScaleOnly = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopScaleOnly, + CollectiveEpilogue +>; + +using GemmScaleOnly = cutlass::gemm::device::GemmUniversalAdapter; + +using CollectiveMainloopScaleOnlyShuffled = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + cute::tuple, LayoutB_Reordered, AlignmentB, + ElementA, LayoutA_Transpose, AlignmentA, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernelScaleOnlyShuffled = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopScaleOnlyShuffled, + CollectiveEpilogue +>; + +using GemmScaleOnlyShuffled = cutlass::gemm::device::GemmUniversalAdapter; + +// =========================================================== MIXED INPUT WITH SCALES AND ZEROS ================================================================== +// We specify scale + zero elements to indicate that we require both. Scales and biases have the same format. 
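Concretely, the usage text further below describes mode 1 as A @ (scale * B) and mode 2 as A @ (scale * B + zero-point), with one scale (and zero) per column of B and per group of g consecutive values along K. The host sketch below shows the indexing this implies for an element B(k, n); it is for intuition only, since the kernel performs this conversion in registers inside the mainloop, and the helper name is illustrative:

```cpp
#include <vector>

// B_q and B_dq are K x N (row-major here just for the sketch); scale and zero are
// N x scale_k with scale_k = ceil_div(K, g). Mode 1 passes with_zero = false.
void dequantize_sketch(std::vector<float>& B_dq, std::vector<int> const& B_q,
                       std::vector<float> const& scale, std::vector<float> const& zero,
                       int K, int N, int g, bool with_zero) {
  int scale_k = (K + g - 1) / g;
  for (int k = 0; k < K; ++k) {
    int group = k / g;                      // all values in a K-group share one scale/zero
    for (int n = 0; n < N; ++n) {
      float s = scale[n * scale_k + group];
      float z = with_zero ? zero[n * scale_k + group] : 0.0f;
      B_dq[k * N + n] = s * static_cast<float>(B_q[k * N + n]) + z;  // scale * B (+ zero)
    }
  }
}

int main() {
  int K = 4, N = 1, g = 2;                  // two groups of size 2 along K
  std::vector<int>   B_q   = {1, -2, 3, -4};
  std::vector<float> scale = {0.5f, 2.0f};  // one scale per group for the single column
  std::vector<float> zero  = {0.0f, 1.0f};
  std::vector<float> B_dq(K * N);
  dequantize_sketch(B_dq, B_q, scale, zero, K, N, g, /*with_zero=*/true);
  // B_dq == {0.5, -1.0, 7.0, -7.0}
  return 0;
}
```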
+using CollectiveMainloopScaleWithZeroPoint = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + cute::tuple, LayoutB_Transpose, AlignmentB, + ElementA, LayoutA_Transpose, AlignmentA, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernelScaleWithZeroPoint = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopScaleWithZeroPoint, + CollectiveEpilogue +>; + +using GemmScaleWithZeroPoint = cutlass::gemm::device::GemmUniversalAdapter; + +using CollectiveMainloopScaleWithZeroPointShuffled = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + cute::tuple, LayoutB_Reordered, AlignmentB, + ElementA, LayoutA_Transpose, AlignmentA, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernelScaleWithZeroPointShuffled = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopScaleWithZeroPointShuffled, + CollectiveEpilogue +>; + +using GemmScaleWithZeroPointShuffled = cutlass::gemm::device::GemmUniversalAdapter; +// ================================================================================================================================================================= + +using StrideC = typename GemmKernelScaleOnly::StrideC; +using StrideD = typename GemmKernelScaleOnly::StrideD; + +using StrideC_ref = cutlass::detail::TagToStrideC_t; +using StrideD_ref = cutlass::detail::TagToStrideC_t; + +// +// Data members +// + +/// Initialization +StrideA stride_A; +StrideB stride_B; +StrideC stride_C; +StrideC_ref stride_C_ref; +StrideD stride_D; +StrideD_ref stride_D_ref; +uint64_t seed; + +LayoutB_Reordered layout_B_reordered; + +using StrideS = typename CollectiveMainloopScaleOnly::StrideScale; +using StrideS_ref = cutlass::detail::TagToStrideB_t; +StrideS stride_S; +StrideS_ref stride_S_ref; + +cutlass::DeviceAllocation block_A; +cutlass::DeviceAllocation block_B; +cutlass::DeviceAllocation block_B_dq; +cutlass::DeviceAllocation block_scale; +cutlass::DeviceAllocation block_zero; +cutlass::DeviceAllocation block_C; +cutlass::DeviceAllocation block_D; +cutlass::DeviceAllocation block_ref_D; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options : MixedDtypeOptions{ + bool shuffle = true; + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + cmd.get_cmd_line_argument("shuffle", shuffle); + + this->MixedDtypeOptions::parse(argc, args); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "55_hopper_int4_bf16_gemm\n\n" + << " Hopper Mixed Data Type GEMM using a Warp Specialized kernel.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= The number of independent gemm problems with mnk shape\n" + << " --g= The size of each group for the scales. To broadcast a vector of scales or zeros, set the group size to K.\n" + << " --mode= The mode to run the gemm. 0 does (A @ B), 1 means A @ (scale * B), 2 means A @ (scale * B + zero-point).\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n" + << " --warmup= Number of warmup iterations to perform.\n\n" + << " --shuffle= Enable the offline layout swizzling.\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "55_hopper_int4_bf16_gemm" << " --m=1024 --n=512 --k=1024 -g=1024 --l=10 --alpha=2 --mode=2 --beta=0.707 \n\n"; + + return out; + } +}; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Initialize operands to be used in the GEMM and reference GEMM +void initialize(Options const& options) { + + auto shape_B = cute::make_shape(options.n, options.k, options.l); + int const scale_k = (options.k + options.g - 1) / options.g; + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, shape_B); + // Reverse stride here due to swap and transpose + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.n, options.m, options.l)); + stride_C_ref = cutlass::make_cute_packed_stride(StrideC_ref{}, cute::make_shape(options.m, options.n, options.l)); + // Reverse stride here due to swap and transpose + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.n, options.m, options.l)); + stride_D_ref = cutlass::make_cute_packed_stride(StrideD_ref{}, cute::make_shape(options.m, options.n, options.l)); + + auto layout_B = make_layout(shape_B, stride_B); + + auto a_coord = cutlass::make_Coord(options.m * options.l, options.k); + auto b_coord = cutlass::make_Coord(options.k, options.n * options.l); + auto c_coord = cutlass::make_Coord(options.m * options.l, options.n); + + block_A.reset(a_coord.product()); + block_B.reset(b_coord.product()); + block_B_dq.reset(b_coord.product()); + block_C.reset(c_coord.product()); + block_D.reset(c_coord.product()); + block_ref_D.reset(c_coord.product()); + + block_scale.reset(scale_k * options.l * options.n); + block_zero.reset(scale_k * options.l * options.n); + + initialize_tensor(block_A, seed + 2022); + initialize_quant_tensor(block_B, seed + 2021); + initialize_tensor(block_C, seed + 2020); + initialize_scale(block_scale, options); + initialize_zero(block_zero, options); + + auto shape_scale_zero = cute::make_shape(options.n, scale_k, options.l); + stride_S = cutlass::make_cute_packed_stride(StrideS{}, cute::make_shape(options.n, scale_k, options.l)); + stride_S_ref = cutlass::make_cute_packed_stride(StrideS_ref{}, cute::make_shape(options.n, scale_k, options.l)); + auto 
layout_scale_zero = make_layout(shape_scale_zero, stride_S_ref); + + dequantize_weight(block_B_dq.get(), block_B.get(), layout_B, block_scale.get(), block_zero.get(), layout_scale_zero, options.g); + + if (options.shuffle) { + // Repeat the reorder layout atom to tile the whole tensor shape + layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B); + reorder_tensor(block_B.get(), layout_B, layout_B_reordered); + + print("Quantized tensor layout: "); + print(layout_B_reordered); + print("\n"); + } +} + +/// Populates a Gemm::Arguments structure from the given commandline options +/// Swap the A and B tensors, as well as problem shapes here. +template +typename Gemm::Arguments args_from_options(Options const& options) +{ + using Args = typename Gemm::Arguments; + auto&& dB = [&]() { + if constexpr (cute::is_same_v || + cute::is_same_v || + cute::is_same_v) { + // offline swizzling is enabled. + return layout_B_reordered; + } + else { + return stride_B; + } + }(); + if (options.mode == MixedDtypeGemmMode::ConvertOnly) { + return Args { + cutlass::gemm::GemmUniversalMode::kGemm, + {options.n, options.m, options.k, options.l}, + {block_B.get(), dB, block_A.get(), stride_A}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D} + }; + } + else if (options.mode == MixedDtypeGemmMode::ScaleOnly) { + return Args { + cutlass::gemm::GemmUniversalMode::kGemm, + {options.n, options.m, options.k, options.l}, + {block_B.get(), dB, block_A.get(), stride_A, block_scale.get(), stride_S, options.g}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D} + }; + } + else if (options.mode == MixedDtypeGemmMode::ScaleWithZeroPoint) { + return Args { + cutlass::gemm::GemmUniversalMode::kGemm, + {options.n, options.m, options.k, options.l}, + {block_B.get(), dB, block_A.get(), stride_A, block_scale.get(), stride_S, options.g, block_zero.get()}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D} + }; + } else { + std::cerr << "Invalid mode " << options.mode << ". Must be 0, 1 or 2." << std::endl; + exit(-1); + } +} + +bool verify(Options const& options) { + // + // Compute reference output + // + + using CollectiveMainloopRef = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + MmaType, LayoutA, AlignmentA, + MmaType, LayoutB, AlignmentB, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAuto, + cutlass::gemm::collective::KernelScheduleAuto + >::CollectiveOp; + + using CollectiveEpilogueRef = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, + ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, + cutlass::epilogue::NoSmemWarpSpecialized + >::CollectiveOp; + + using GemmKernelRef = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopRef, + CollectiveEpilogueRef + >; + + using GemmRef = cutlass::gemm::device::GemmUniversalAdapter; + + typename GemmRef::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + {options.m, options.n, options.k, options.l}, + {block_A.get(), stride_A, block_B_dq.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C_ref, block_ref_D.get(), stride_D_ref} + }; + + // Run the gemm where the scaling is performed outside of the kernel. 
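Once this reference kernel (which consumes the pre-dequantized `block_B_dq`) has run, the two outputs are compared with a relative tolerance rather than exactly, since the fused and unfused paths accumulate in different orders. A host sketch of that style of check follows; the exact formula implemented by `cutlass::reference::device::BlockCompareRelativelyEqual` may differ in detail, and the function name below is illustrative:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>

// Sketch only: values that are both effectively zero count as equal, otherwise the
// difference must be small relative to the larger magnitude.
bool relatively_equal_sketch(float a, float b, float epsilon, float non_zero_floor) {
  float mag = std::max(std::fabs(a), std::fabs(b));
  if (mag < non_zero_floor) {
    return true;
  }
  return std::fabs(a - b) <= epsilon * mag;
}

int main() {
  assert( relatively_equal_sketch(100.0f, 100.5f, 1e-2f, 1e-4f));  // 0.5% off: accepted
  assert(!relatively_equal_sketch(100.0f, 105.0f, 1e-2f, 1e-4f));  // 5% off: rejected
  return 0;
}
```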
+ GemmRef gemm_ref; + size_t workspace_size = GemmRef::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + CUTLASS_CHECK(gemm_ref.can_implement(arguments)); + CUTLASS_CHECK(gemm_ref.initialize(arguments, workspace.get())); + CUTLASS_CHECK(gemm_ref.run()); + + // compare_reference + ElementD const epsilon(1e-2f); + ElementD const non_zero_floor(1e-4f); + bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_D.get(), block_D.get(), block_D.size(), epsilon, non_zero_floor); + + return passed; +} + +/// Execute a given example GEMM computation +template +int run(Options &options) +{ + initialize(options); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm + auto arguments = args_from_options(options); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(gemm.run()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + MixedDtypeResult result; + result.passed = verify(options); + mixed_dtype_profiling(gemm, options, result); + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl; + if (!result.passed) { + exit(-1); + } + + return 0; +} + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.0 Toolkit to run this example + // and must have compute capability at least 90. + if (__CUDACC_VER_MAJOR__ < 12) { + std::cerr << "This example requires CUDA 12 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(¤t_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (props.major < 9) { + std::cerr + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater).\n"; + return 0; + } + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // + // Evaluate CUTLASS kernels + // + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + if (options.mode == MixedDtypeGemmMode::ConvertOnly) { + std::cout << "Running in no scale mode." << std::endl; + if (options.shuffle) { + std::cout << "Offline shuffle enabled." << std::endl; + run(options); + } else { + std::cout << "Offline shuffle disabled." << std::endl; + run(options); + } + } + else if (options.mode == MixedDtypeGemmMode::ScaleOnly) { + if (options.g == options.k) { + std::cout << "Running in per-column scale mode." << std::endl; + } else { + std::cout << "Running in group scale mode." 
<< std::endl; + } + if (options.shuffle) { + std::cout << "Offline shuffle enabled." << std::endl; + run(options); + } else { + std::cout << "Offline shuffle disabled." << std::endl; + run(options); + } + } + else if (options.mode == MixedDtypeGemmMode::ScaleWithZeroPoint) { + if (options.g == options.k) { + std::cout << "Running in per-column scale and zero mode." << std::endl; + } else { + std::cout << "Running in group scale and zero mode." << std::endl; + } + if (options.shuffle) { + std::cout << "Offline shuffle enabled." << std::endl; + run(options); + } else { + std::cout << "Offline shuffle disabled." << std::endl; + run(options); + } + } +#endif + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu b/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu new file mode 100644 index 0000000000..40fa689489 --- /dev/null +++ b/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu @@ -0,0 +1,562 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Hopper GEMM example with different data types using CUTLASS 3.0 APIs for NVIDIA Hopper architecture + + This example shows how to perform INT4 x FP8 GEMM and scale up the INT4 weight during dequantization. It uses a look-up table to avoid the multiplications + between INT4 and FP8. To trigger this method, use cutlass::Array as the scale type in the collective's arguments. + + However, this algorithm requires changes to the encoding of INT4 weights and scale factors. These changes must happen before launching the GEMM. 
See the helper functions + `unify_quant_encoding`, `initialize_packed_scale` in the header `fp8_packed_scale.hpp` for details. + + In a nutshell, the positive values of INT4 weights need to be encoded in the same way as negative values except for the sign bit. For each scale factor, + 8 negative results (-8 x scale, -7 x scale, ... -1 x scale) are packed together, forming a cutlass::Array value. + + The narrower type always passes through the register file. Therefore, in cases where the narrower type is operand B, the collective will implicitly swap + A and B in the main loop. However, as a result of this collective performing implicit swaps, it does not support TMA epilogues. Consequently, it is essential to consider this when constructing the epilogue, + as illustrated in this example. + + Note that in this example, we explicitly swap A and B in order to use TMA epilogues. We do this since TMA epilogues are more performant on problem sizes of interest. + + As an additional optimization, we can reorder the narrow data type tensor such that elements read into register file by the same thread are contiguous in global and shared memory. + This promotes vectorization of shared memory loads and removes additional instructions on the critical path. For example, when MMA is performed in FP8 data type, each thread reads + 4 groups of 4 elements that are logically contiguous in the same row (refer to https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n32-a for thread-value layout). + If the narrow type is INT4 and tensor is major in K dim, only 16 bits can be read at a time, leading to extra load instructions and suboptimal utilization of shared memory throughput. + If we reorder the data offline to place all 16 elements read by a thread contiguously in memory, a single 64-bit load is sufficient. This reordering is often feasible when the quantized + tensor is static (e.g. weight tensor of a NN layer at inference time). This example demonstrates how such a reordering can be performed and communicated to the kernel when the options.shuffle is set to true. + + It is expected that the scale's K dimension be scale_k = ceil_div(problem_k, group_size). + + Scales are always expected to be MN major. This means the fastest changing dimension must be M if A is scaled or N if B is scaled. + + If A is being scaled, the scales must have shape [M, scale_k], while if B is scaled, it must have shape [N, scale_k]. + + The implementation only supports "group-wise" scales. However, we can make it work for per-column scales by setting the group's size + equal to the gemm problem K. + + Limitations: + 1) Only supports INT4 x { FP8, INT8, UINT8 }. The scales must be the same as mma Type. Scale with zero-point mode is not supported. + 2) The INT4 weights and scale factors have additional encoding requirements. + 3) The scales must be MN major. That means if A is scaled, it must be column major, but if B is scaled it must be row major. + 4) The scales must have the same layout and groupsize. + 5) The groupsize must be greater or equal to the tile shape k. + 6) Currently, TMA epilogues cannot be used when the narrow type is the B operand. This limitation arises because the implementation always swaps the + operands to ensure that the narrow type passes through the register file, and TMA epilogues do not currently support implicit swap + transpose operations. + We plan to address this limitation in the future. 
However, we address this in the example by explicitly swapping and transposing the operands. + + Optimizing suggestions: + 1) Use a small tile size, since the register pressure for this GEMM (and RS GEMM in general) is high (it uses a lot of register space). + + Examples: + + Runs the mixed input batched gemm (with batch size 2), converting B to the type of A (mode 0) + $ ./examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm --m=2048 --n=2048 --k=2048 --l=2 --mode=0 + + Runs the mixed input gemm, and applies a scaling factor to B before mma (mode 1). Applies a vector of scales to the entire + matrix (group size is the same as the gemm k dimension). + $ ./examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm --m=4096 --n=5120 --k=8192 --g=8192 --mode=1 +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "helper.h" +#include "mixed_dtype_utils.hpp" +#include "packed_scale.hpp" +#include "reorder_utils.hpp" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// +using MmaType = cutlass::float_e4m3_t; +using QuantType = cutlass::int4b_t; +constexpr int TileShapeK = 128 * 8 / sizeof_bits::value; + +// A matrix configuration +using ElementA = MmaType; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = QuantType; // Element type for B matrix operand +using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// This example manually swaps and transposes, so keep transpose of input layouts +using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; +using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type; + +using StrideA = cutlass::detail::TagToStrideA_t; +using StrideB = cutlass::detail::TagToStrideB_t; + +// Define the CuTe layout for reoredered quantized tensor B +// LayoutAtomQuant places values that will be read by the same thread in contiguous locations in global memory. 
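The bf16 variant of this example spells the per-fragment value order out as `ValueShuffle` with the comment "order [0,2,4,6,1,3,5,7]". As a toy, CUTLASS-free illustration of what such a value reordering does, the sketch below gathers every group of 8 elements in that order; the real atom additionally tiles the pattern across threads and MMA atoms and is applied to the quantized tensor by `reorder_tensor`:

```cpp
#include <array>
#include <cstdio>
#include <vector>

// Gather every group of 8 values in the order 0,2,4,6,1,3,5,7 (illustrative only).
std::vector<int> shuffle_values(std::vector<int> const& in) {
  constexpr std::array<int, 8> order = {0, 2, 4, 6, 1, 3, 5, 7};
  std::vector<int> out(in.size());
  for (std::size_t base = 0; base + 8 <= in.size(); base += 8) {
    for (int i = 0; i < 8; ++i) {
      out[base + i] = in[base + order[i]];
    }
  }
  return out;
}

int main() {
  std::vector<int> v = {10, 11, 12, 13, 14, 15, 16, 17};
  for (int x : shuffle_values(v)) std::printf("%d ", x);  // prints: 10 12 14 16 11 13 15 17
  std::printf("\n");
  return 0;
}
```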
+// It specifies the reordering within a single warp's fragment +using LayoutAtomQuant = decltype(compute_memory_reordering_atom()); +using LayoutB_Reordered = decltype(tile_to_shape(LayoutAtomQuant{}, Layout, StrideB>{})); + +using ElementScale = MmaType; +using ElementZero = ElementScale; // only for verify +using LayoutScale = cutlass::layout::RowMajor; + +// C/D matrix configuration +using ElementC = cutlass::half_t; // Element type for C and D matrix operands +using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes) + +// D matrix configuration +using ElementD = ElementC; +using LayoutD = LayoutC; +constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ElementCompute = float; // Element type for epilogue computation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using TileShape = Shape<_128,_128,cute::Int>; // Threadblock-level tile size +using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedCooperative; // Kernel to launch based on the default setting in the Collective Builder +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; +using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + EpilogueTileType, + ElementAccumulator, ElementAccumulator, + // Transpose layout of D here since we use explicit swap + transpose + // the void type for C tells the builder to allocate 0 smem for the C matrix. + // We can enable this if beta == 0 by changing ElementC to void below. + ElementC, typename cutlass::layout::LayoutTranspose::type, AlignmentC, + ElementD, typename cutlass::layout::LayoutTranspose::type, AlignmentD, + EpilogueSchedule // This is the only epi supporting the required swap + transpose. + >::CollectiveOp; + +// =========================================================== MIXED INPUT WITH SCALES =========================================================================== +// The Scale information must get paired with the operand that will be scaled. In this example, B is scaled so we make a tuple of B's information and the scale information. 
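For this INT4 x FP8 kernel the scale entry in that tuple is a packed `cutlass::Array` holding, per scale factor s, the eight precomputed products -8*s through -1*s, which is what enables the multiplication-free lookup described in the file comment above. The sketch below illustrates the idea on the host; the actual bit-level encoding produced by `unify_quant_encoding` and `initialize_packed_scale` lives in `packed_scale.hpp`, so the indexing here is illustrative only:

```cpp
#include <array>
#include <cassert>

// Per scale factor s, pack the eight products -8*s .. -1*s into a small table.
std::array<float, 8> make_packed_scale(float s) {
  std::array<float, 8> table{};
  for (int i = 0; i < 8; ++i) {
    table[i] = -(8 - i) * s;        // table[0] = -8*s, ..., table[7] = -1*s
  }
  return table;
}

// Dequantize an INT4 value in [-8, 7] with a table lookup instead of a multiply.
float dequantize_int4(int q, std::array<float, 8> const& table) {
  if (q == 0) {
    return 0.0f;
  }
  return (q < 0) ? table[8 + q]     // q = -8 .. -1  maps to table[0] .. table[7]
                 : -table[8 - q];   // q =  1 ..  7  negates the matching entry
}

int main() {
  auto table = make_packed_scale(0.5f);
  assert(dequantize_int4(-8, table) == -4.0f);
  assert(dequantize_int4( 3, table) ==  1.5f);
  return 0;
}
```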
+using CollectiveMainloopScaleOnly = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + cute::tuple>, LayoutB_Transpose, AlignmentB, + ElementA, LayoutA_Transpose, AlignmentA, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernelScaleOnly = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopScaleOnly, + CollectiveEpilogue +>; + +using CollectiveMainloopShuffled = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + cute::tuple>, LayoutB_Reordered, AlignmentB, + ElementA, LayoutA_Transpose, AlignmentA, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopShuffled, + CollectiveEpilogue +>; + +using GemmScaleOnly = cutlass::gemm::device::GemmUniversalAdapter; +using GemmShuffled = cutlass::gemm::device::GemmUniversalAdapter; + +using StrideC = typename GemmKernelScaleOnly::StrideC; +using StrideD = typename GemmKernelScaleOnly::StrideD; + +using StrideC_ref = cutlass::detail::TagToStrideC_t; +using StrideD_ref = cutlass::detail::TagToStrideC_t; + +// +// Data members +// + +/// Initialization +StrideA stride_A; +StrideB stride_B; +StrideC stride_C; +StrideC_ref stride_C_ref; +StrideD stride_D; +StrideD_ref stride_D_ref; +uint64_t seed; + +LayoutB_Reordered layout_B_reordered; + +using StrideS = typename CollectiveMainloopScaleOnly::StrideScale; +using StrideS_ref = cutlass::detail::TagToStrideB_t; +StrideS stride_S; +StrideS_ref stride_S_ref; + +cutlass::DeviceAllocation block_A; +cutlass::DeviceAllocation block_B; +cutlass::DeviceAllocation block_B_modified; +cutlass::DeviceAllocation block_B_dq; +cutlass::DeviceAllocation block_scale; +cutlass::DeviceAllocation> block_scale_packed; +cutlass::DeviceAllocation block_zero; +cutlass::DeviceAllocation block_C; +cutlass::DeviceAllocation block_D; +cutlass::DeviceAllocation block_ref_D; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options : MixedDtypeOptions { + bool shuffle = true; + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + cmd.get_cmd_line_argument("shuffle", shuffle); + + this->MixedDtypeOptions::parse(argc, args); + + mode = 1; // override the mode value to always be scale only mode + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "55_hopper_int4_fp8_gemm\n\n" + << " Hopper Mixed Data Type GEMM using a Warp Specialized kernel.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= The number of independent gemm problems with mnk shape\n" + << " --g= The size of each group for the scales. 
To broadcast a vector of scales or zeros, set the group size to K.\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n" + << " --warmup= Number of warmup iterations to perform.\n\n" + << " --shuffle= Enable the offline layout swizzling.\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "55_hopper_int4_fp8_gemm" << " --m=1024 --n=512 --k=1024 -g=1024 --l=10 --alpha=2 --beta=0.707 \n\n"; + + return out; + } +}; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Initialize operands to be used in the GEMM and reference GEMM +void initialize(Options const& options) { + + auto shape_B = cute::make_shape(options.n, options.k, options.l); + int const scale_k = (options.k + options.g - 1) / options.g; + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, shape_B); + // Reverse stride here due to swap and transpose + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.n, options.m, options.l)); + stride_C_ref = cutlass::make_cute_packed_stride(StrideC_ref{}, cute::make_shape(options.m, options.n, options.l)); + // Reverse stride here due to swap and transpose + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.n, options.m, options.l)); + stride_D_ref = cutlass::make_cute_packed_stride(StrideD_ref{}, cute::make_shape(options.m, options.n, options.l)); + + auto layout_B = make_layout(shape_B, stride_B); + + auto a_coord = cutlass::make_Coord(options.m * options.l, options.k); + auto b_coord = cutlass::make_Coord(options.k, options.n * options.l); + auto c_coord = cutlass::make_Coord(options.m * options.l, options.n); + + block_A.reset(a_coord.product()); + block_B.reset(b_coord.product()); + block_B_modified.reset(b_coord.product()); + block_B_dq.reset(b_coord.product()); + block_C.reset(c_coord.product()); + block_D.reset(c_coord.product()); + block_ref_D.reset(c_coord.product()); + + block_scale.reset(scale_k * options.l * options.n); + block_scale_packed.reset(scale_k * options.l * options.n); + block_zero.reset(scale_k * options.l * options.n); + + initialize_tensor(block_A, seed + 2022); + initialize_quant_tensor(block_B, seed + 2021); + unify_quant_encoding(block_B, block_B_modified); + initialize_tensor(block_C, seed + 2020); + initialize_scale(block_scale, options); + initialize_packed_scale(block_scale, block_scale_packed); + initialize_zero(block_zero, options); + + auto shape_scale_zero = cute::make_shape(options.n, scale_k, options.l); + stride_S = cutlass::make_cute_packed_stride(StrideS{}, cute::make_shape(options.n, scale_k, options.l)); + stride_S_ref = cutlass::make_cute_packed_stride(StrideS_ref{}, cute::make_shape(options.n, scale_k, options.l)); + auto layout_scale_zero = make_layout(shape_scale_zero, stride_S_ref); + + dequantize_weight(block_B_dq.get(), block_B.get(), layout_B, block_scale.get(), block_zero.get(), layout_scale_zero, options.g); + + if (options.shuffle) { + // Repeat the reorder layout atom to tile the whole tensor shape + layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B); + reorder_tensor(block_B_modified.get(), layout_B, layout_B_reordered); + + 
print("Quantized tensor layout: "); + print(layout_B_reordered); + print("\n"); + } +} + +/// Populates a Gemm::Arguments structure from the given commandline options +/// Swap the A and B tensors, as well as problem shapes here. +template +typename Gemm::Arguments args_from_options(Options const& options) +{ + using Args = typename Gemm::Arguments; + auto&& dB = [&]() { + if constexpr (cute::is_same_v) { // offline swizzling is enabled. + return layout_B_reordered; + } + else { + return stride_B; + } + }(); + return Args { + cutlass::gemm::GemmUniversalMode::kGemm, + {options.n, options.m, options.k, options.l}, + {block_B_modified.get(), dB, block_A.get(), stride_A, block_scale_packed.get(), stride_S, options.g}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D} + }; +} + +bool verify(Options const& options) { + // + // Compute reference output + // + + // In this example, we use the GPU default kernels as a reference (unfused scale). + // This avoids numerical differences due to different accumulation order. + + // Again, due to numerical differences, we must use fast acc here when the mma type is + // FP8 as the fused implementation only supports fast acc at the moment. + constexpr bool IsFP8Input = cute::is_same_v || cute::is_same_v; + using FP8Sched = cute::conditional_t(TileShape{}) == 64, cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum, cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum>; + using ScheduleRef = cute::conditional_t; + + using CollectiveMainloopRef = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + MmaType, LayoutA, AlignmentA, + MmaType, LayoutB, AlignmentB, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAuto, + ScheduleRef + >::CollectiveOp; + + using CollectiveEpilogueRef = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, + ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, + cutlass::epilogue::NoSmemWarpSpecialized + >::CollectiveOp; + + using GemmKernelRef = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopRef, + CollectiveEpilogueRef + >; + + using GemmRef = cutlass::gemm::device::GemmUniversalAdapter; + + typename GemmRef::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + {options.m, options.n, options.k, options.l}, + {block_A.get(), stride_A, block_B_dq.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C_ref, block_ref_D.get(), stride_D_ref} + }; + + // Run the gemm where the scaling is performed outside of the kernel. 
+ GemmRef gemm_ref; + size_t workspace_size = GemmRef::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + CUTLASS_CHECK(gemm_ref.can_implement(arguments)); + CUTLASS_CHECK(gemm_ref.initialize(arguments, workspace.get())); + CUTLASS_CHECK(gemm_ref.run()); + + // compare_reference + ElementD const epsilon(1e-2f); + ElementD const non_zero_floor(1e-4f); + bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_D.get(), block_D.get(), block_D.size(), epsilon, non_zero_floor); + + return passed; +} + +/// Execute a given example GEMM computation +template +int run(Options &options) +{ + initialize(options); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm + auto arguments = args_from_options(options); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(gemm.run()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + MixedDtypeResult result; + result.passed = verify(options); + mixed_dtype_profiling(gemm, options, result); + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl; + if (!result.passed) { + exit(-1); + } + + return 0; +} + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.0 Toolkit to run this example + // and must have compute capability at least 90. + if (__CUDACC_VER_MAJOR__ < 12) { + std::cerr << "This example requires CUDA 12 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(¤t_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (props.major < 9) { + std::cerr + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater).\n"; + return 0; + } + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // + // Evaluate CUTLASS kernels + // + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + if (options.g == options.k) { + std::cout << "Running in per-column scale mode." << std::endl; + } else { + std::cout << "Running in group scale mode." << std::endl; + } + if (options.shuffle) { + std::cout << "Offline shuffle enabled." << std::endl; + run(options); + } else { + std::cout << "Offline shuffle disabled." 
<< std::endl; + run(options); + } +#endif + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/55_hopper_mixed_dtype_gemm/55_hopper_mixed_dtype_gemm.cu b/examples/55_hopper_mixed_dtype_gemm/55_hopper_mixed_dtype_gemm.cu new file mode 100644 index 0000000000..b482d0d15f --- /dev/null +++ b/examples/55_hopper_mixed_dtype_gemm/55_hopper_mixed_dtype_gemm.cu @@ -0,0 +1,535 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Hopper GEMM example with different data types using CUTLASS 3.0 APIs for NVIDIA Hopper architecture + + This example shows how to perform GEMM where the input tensors A and B have different element types. CUTLASS currently supports upcasting + from a narrower (fewer bits) to a wider (more bits) type and utilizing the tensor core instruction for the wider type. For instance, when doing + INT8 x FP16, CUTLASS will convert INT8 -> FP16 and do math using FP16 tensor cores. Similarly, for INT4 x INT8, it will upcast to INT8 and issue math + using INT8 tensor cores. + + The narrower type always passes through the register file. Therefore, in cases where the narrower type is operand B, the collective will implicitly swap + A and B in the main loop. However, implicit swaps do not support TMA epilogues. Consequently, it is essential to consider this when constructing the epilogue, + as illustrated in this example. + + Note that in this example, we explicitly swap A and B in order to use TMA epilogues. We do this since TMA epilogues are more performant on problem sizes of interest. + + It is expected that the scale's K dimension be scale_k = ceil_div(problem_k, group_size). 
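For example, with the sizes used by the commands further below, this works out as follows; the same ceil_div appears in `initialize()` as `(options.k + options.g - 1) / options.g` (the helper name here is illustrative):

```cpp
#include <cassert>

// scale_k = ceil_div(problem_k, group_size)
int scale_k_for(int k, int g) { return (k + g - 1) / g; }

int main() {
  assert(scale_k_for(8192, 128)  == 64);  // --g=128: 64 scale groups along K
  assert(scale_k_for(8192, 8192) == 1);   // --g=K: one scale per column (per-column mode)
  return 0;
}
```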
+ + Scales are always expected to be MN major. This means the fastest changing dimension must be M if A is scaled or N if B is scaled. + + If A is being scaled, the scales should have shape [M, scale_k], while if B is scaled, it must have shape [N, scale_k]. + + The implementation only supports "group-wise" scales. However, we can make it work for per-column scales by setting the groups size + equal to the gemm problem K. + + Limitations: + 1) The narrow type must always be in K-major format. + 2) The scales and zeros must be MN major. That means if A is scaled, it must be column major, but if B is scaled it must be row major. + 3) The scales and the zeros must have the same layout and groupsize. + 4) The groupsize must be greater or equal to tile shape k. + 5) When dealing with 8-bit x {4-bit, 2-bit}, both inputs must be in K-major format. + 6) Currently, TMA epilogues cannot be used when the narrow type is the B operand. This limitation arises because the implementation always swaps the + operands to ensure that the narrow type passes through the register file, and TMA epilogues do not currently support implicit swap + transpose operations. + We plan to address this limitation in the future. However, we address this in the example by explicitly swapping and transposing the operands. + + Optimizing suggestions: + 1) Use a small tile size, since the register pressure for this GEMM (and RS GEMM in general) is high (it uses a lot of register space). + 2) Try avoid using scale or zero mode cause the computations will be the bottleneck. + + Examples: + + Runs the mixed input batched gemm (with batch size 2), converting B to the type of A (mode 0) + $ ./examples/55_hopper_mixed_dtype_gemm/55_hopper_mixed_dtype_gemm --m=2048 --n=2048 --k=2048 --l=2 --mode=0 + + Runs the mixed input gemm, and applies a scaling factor to B before mma (mode 1). Applies a vector of scales to the entire + matrix (group size is the same as the gemm k dimension). + $ ./examples/55_hopper_mixed_dtype_gemm/55_hopper_mixed_dtype_gemm --m=4096 --n=5120 --k=8192 --g=8192 --mode=1 + + Runs the mixed input gemm, and applies a scaling factor and adds a zero-point to B before mma (mode 2). Uses a group size of 128. 
+ $ ./examples/55_hopper_mixed_dtype_gemm/55_hopper_mixed_dtype_gemm --m=2048 --n=5120 --k=8192 --g=128 --mode=2 +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "helper.h" +#include "mixed_dtype_utils.hpp" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// +using MmaType = cutlass::half_t; +using QuantType = cutlass::float_e4m3_t; +constexpr int TileShapeK = 128 * 8 / sizeof_bits::value; + +// A matrix configuration +using ElementA = MmaType; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = QuantType; // Element type for B matrix operand +using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// This example manually swaps and transposes, so keep transpose of input layouts +using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; +using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type; + +using ElementZero = cutlass::half_t; +using ElementScale = cutlass::half_t; +using LayoutScale = cutlass::layout::RowMajor; + +// C/D matrix configuration +using ElementC = cutlass::half_t; // Element type for C and D matrix operands +using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes) + +// D matrix configuration +using ElementD = ElementC; +using LayoutD = LayoutC; +constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ElementCompute = float; // Element type for epilogue computation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using TileShape = Shape<_128,_128,cute::Int>; // Threadblock-level tile size +using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedCooperative; // 
Kernel to launch based on the default setting in the Collective Builder +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; +using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + EpilogueTileType, + ElementAccumulator, ElementAccumulator, + // Transpose layout of D here since we use explicit swap + transpose + // the void type for C tells the builder to allocate 0 smem for the C matrix. + // We can enable this if beta == 0 by changing ElementC to void below. + ElementC, typename cutlass::layout::LayoutTranspose::type, AlignmentC, + ElementD, typename cutlass::layout::LayoutTranspose::type, AlignmentD, + EpilogueSchedule // This is the only epi supporting the required swap + transpose. + >::CollectiveOp; + +// ============================================================ MIXED INPUT NO SCALES ============================================================================ +// The collective will infer that the narrow type should be upcasted to the wide type. +// We swap A and B operands to the builder here +using CollectiveMainloopConvertOnly = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementB, LayoutB_Transpose, AlignmentB, + ElementA, LayoutA_Transpose, AlignmentA, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernelConvertOnly = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopConvertOnly, + CollectiveEpilogue +>; + +using GemmConvertOnly = cutlass::gemm::device::GemmUniversalAdapter; + +// =========================================================== MIXED INPUT WITH SCALES =========================================================================== +// The Scale information must get paired with the operand that will be scaled. In this example, B is scaled so we make a tuple of B's information and the scale information. +using CollectiveMainloopScaleOnly = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + cute::tuple, LayoutB_Transpose, AlignmentB, + ElementA, LayoutA_Transpose, AlignmentA, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernelScaleOnly = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopScaleOnly, + CollectiveEpilogue +>; + +using GemmScaleOnly = cutlass::gemm::device::GemmUniversalAdapter; + +// =========================================================== MIXED INPUT WITH SCALES AND ZEROS ================================================================== +// We specify scale + zero elements to indicate that we require both. Scales and biases have the same format. 
+using CollectiveMainloopScaleWithZeroPoint = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + cute::tuple, LayoutB_Transpose, AlignmentB, + ElementA, LayoutA_Transpose, AlignmentA, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernelScaleWithZeroPoint = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopScaleWithZeroPoint, + CollectiveEpilogue +>; + +using GemmScaleWithZeroPoint = cutlass::gemm::device::GemmUniversalAdapter; +// ================================================================================================================================================================= + +using StrideA = cutlass::detail::TagToStrideA_t; +using StrideB = cutlass::detail::TagToStrideB_t; +using StrideC = typename GemmKernelScaleWithZeroPoint::StrideC; +using StrideD = typename GemmKernelScaleWithZeroPoint::StrideD; + +using StrideC_ref = cutlass::detail::TagToStrideC_t; +using StrideD_ref = cutlass::detail::TagToStrideC_t; + +// +// Data members +// + +/// Initialization +StrideA stride_A; +StrideB stride_B; +StrideC stride_C; +StrideC_ref stride_C_ref; +StrideD stride_D; +StrideD_ref stride_D_ref; +uint64_t seed; + +// Scale and Zero share a stride since the layout and shapes must be the same. +using StrideS = typename CollectiveMainloopScaleWithZeroPoint::StrideScale; +using StrideS_ref = cutlass::detail::TagToStrideB_t; +StrideS stride_S; +StrideS_ref stride_S_ref; + +cutlass::DeviceAllocation block_A; +cutlass::DeviceAllocation block_B; +cutlass::DeviceAllocation block_B_dq; +cutlass::DeviceAllocation block_scale; +cutlass::DeviceAllocation block_zero; +cutlass::DeviceAllocation block_C; +cutlass::DeviceAllocation block_D; +cutlass::DeviceAllocation block_ref_D; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Initialize operands to be used in the GEMM and reference GEMM +void initialize(MixedDtypeOptions const& options) { + + auto shape_b = cute::make_shape(options.n, options.k, options.l); + int const scale_k = (options.k + options.g - 1) / options.g; + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, shape_b); + // Reverse stride here due to swap and transpose + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.n, options.m, options.l)); + stride_C_ref = cutlass::make_cute_packed_stride(StrideC_ref{}, cute::make_shape(options.m, options.n, options.l)); + // Reverse stride here due to swap and transpose + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.n, options.m, options.l)); + stride_D_ref = cutlass::make_cute_packed_stride(StrideD_ref{}, cute::make_shape(options.m, options.n, options.l)); + + auto a_coord = cutlass::make_Coord(options.m * options.l, 
options.k); + auto b_coord = cutlass::make_Coord(options.k, options.n * options.l); + auto c_coord = cutlass::make_Coord(options.m * options.l, options.n); + + block_A.reset(a_coord.product()); + block_B.reset(b_coord.product()); + block_B_dq.reset(b_coord.product()); + block_C.reset(c_coord.product()); + block_D.reset(c_coord.product()); + block_ref_D.reset(c_coord.product()); + + block_scale.reset(scale_k * options.l * options.n); + block_zero.reset(scale_k * options.l * options.n); + + initialize_tensor(block_A, seed + 2022); + initialize_quant_tensor(block_B, seed + 2021); + initialize_tensor(block_C, seed + 2020); + initialize_scale(block_scale, options); + initialize_zero(block_zero, options); + + auto layout_B = make_layout(shape_b, stride_B); + + auto shape_scale_zero = cute::make_shape(options.n, scale_k, options.l); + stride_S = cutlass::make_cute_packed_stride(StrideS{}, cute::make_shape(options.n, scale_k, options.l)); + stride_S_ref = cutlass::make_cute_packed_stride(StrideS_ref{}, cute::make_shape(options.n, scale_k, options.l)); + auto layout_scale_zero = make_layout(shape_scale_zero, stride_S_ref); + + dequantize_weight(block_B_dq.get(), block_B.get(), layout_B, block_scale.get(), block_zero.get(), layout_scale_zero, options.g); +} + +/// Populates a Gemm::Arguments structure from the given commandline options +template +Args args_from_options(MixedDtypeOptions const& options) +{ +// Swap the A and B tensors, as well as problem shapes here. + if (options.mode == MixedDtypeGemmMode::ConvertOnly) { + return Args { + cutlass::gemm::GemmUniversalMode::kGemm, + {options.n, options.m, options.k, options.l}, + {block_B.get(), stride_B, block_A.get(), stride_A}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D} + }; + } + else if (options.mode == MixedDtypeGemmMode::ScaleOnly) { + return Args { + cutlass::gemm::GemmUniversalMode::kGemm, + {options.n, options.m, options.k, options.l}, + {block_B.get(), stride_B, block_A.get(), stride_A, block_scale.get(), stride_S, options.g}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D} + }; + } + else if (options.mode == MixedDtypeGemmMode::ScaleWithZeroPoint) { + return Args { + cutlass::gemm::GemmUniversalMode::kGemm, + {options.n, options.m, options.k, options.l}, + {block_B.get(), stride_B, block_A.get(), stride_A, block_scale.get(), stride_S, options.g, block_zero.get()}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D} + }; + } else { + std::cerr << "Invalid mode " << options.mode << ". Must be 0, 1 or 2." << std::endl; + exit(-1); + } +} + +bool verify(MixedDtypeOptions const& options) { + // + // Compute reference output + // + + // In this example, we use the GPU default kernels as a reference (unfused scale) + // This avoids numerical differences due to different accumulation order. + + // Again, due to numerical differences, we must use fast acc here when the mma type is + // FP8 as the fused implementation only supports fast acc at the moment. 
+ constexpr bool IsFP8Input = cute::is_same_v || cute::is_same_v; + using FP8Sched = cute::conditional_t(TileShape{}) == 64, cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum, cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum>; + using ScheduleRef = cute::conditional_t; + + using CollectiveMainloopRef = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + MmaType, LayoutA, AlignmentA, + MmaType, LayoutB, AlignmentB, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAuto, + ScheduleRef + >::CollectiveOp; + + using CollectiveEpilogueRef = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, + ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, + cutlass::epilogue::NoSmemWarpSpecialized + >::CollectiveOp; + + using GemmKernelRef = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopRef, + CollectiveEpilogueRef + >; + + using GemmRef = cutlass::gemm::device::GemmUniversalAdapter; + + typename GemmRef::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + {options.m, options.n, options.k, options.l}, + {block_A.get(), stride_A, block_B_dq.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C_ref, block_ref_D.get(), stride_D_ref} + }; + + // Run the gemm where the scaling is performed outside of the kernel. + GemmRef gemm_ref; + size_t workspace_size = GemmRef::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + CUTLASS_CHECK(gemm_ref.can_implement(arguments)); + CUTLASS_CHECK(gemm_ref.initialize(arguments, workspace.get())); + CUTLASS_CHECK(gemm_ref.run()); + + // compare_reference + ElementD const epsilon(1e-2f); + ElementD const non_zero_floor(1e-4f); + bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_D.get(), block_D.get(), block_D.size(), epsilon, non_zero_floor); + return passed; +} + +/// Execute a given example GEMM computation +template +int run(MixedDtypeOptions &options) +{ + initialize(options); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm + auto arguments = args_from_options(options); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(gemm.run()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + MixedDtypeResult result; + result.passed = verify(options); + mixed_dtype_profiling(gemm, options, result); + std::cout << " Disposition: " << (result.passed ? 
"Passed" : "Failed") << std::endl; + if (!result.passed) { + exit(-1); + } + + return 0; +} + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.0 Toolkit to run this example + // and must have compute capability at least 90. + if (__CUDACC_VER_MAJOR__ < 12) { + std::cerr << "This example requires CUDA 12 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(¤t_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (props.major < 9) { + std::cerr + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater).\n"; + return 0; + } + // + // Parse options + // + + MixedDtypeOptions options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // + // Evaluate CUTLASS kernels + // + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + if (options.mode == MixedDtypeGemmMode::ConvertOnly) { + std::cout << "Running in no scale mode." << std::endl; + run(options); + } + else if (options.mode == MixedDtypeGemmMode::ScaleOnly) { + if (options.g == options.k) { + std::cout << "Running in per-column scale mode." << std::endl; + } else { + std::cout << "Running in group scale mode." << std::endl; + } + run(options); + } + else if (options.mode == MixedDtypeGemmMode::ScaleWithZeroPoint) { + if (options.g == options.k) { + std::cout << "Running in per-column scale and zero mode." << std::endl; + } else { + std::cout << "Running in group scale and zero mode." << std::endl; + } + run(options); + } +#endif + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/55_hopper_mixed_dtype_gemm/CMakeLists.txt b/examples/55_hopper_mixed_dtype_gemm/CMakeLists.txt new file mode 100644 index 0000000000..23dca4f3fd --- /dev/null +++ b/examples/55_hopper_mixed_dtype_gemm/CMakeLists.txt @@ -0,0 +1,81 @@ + +# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Note that we set --iterations=0 for all tests below to disable the performance benchmarking. +# Only the correctness check will be run by these commands. + +set(TEST_DIRECT_BATCHED --m=2048 --n=2048 --k=2048 --l=2 --mode=0 --iterations=0) # Direct conversion + +set(TEST_SCALE_PERCOL --m=4096 --n=5120 --k=8192 --g=8192 --mode=1 --iterations=0) # Per Column scaling +set(TEST_SCALE_ZERO_PERCOL --m=4096 --n=5120 --k=8192 --g=8192 --mode=2 --iterations=0) # Per Column scaling + +set(TEST_SCALE_GROUP --m=2048 --n=5120 --k=8192 --g=512 --mode=1 --iterations=0) # Group-wise scaling +set(TEST_SCALE_ZERO_GROUPED --m=2048 --n=5120 --k=8192 --g=256 --mode=2 --iterations=0) # Group-wise scaling with zero-point + +set(TEST_SCALE_RESIDUE --m=128 --n=128 --k=320 --g=128 --mode=1 --iterations=0) # Final group has residue +set(TEST_SCALE_ZERO_RESIDUE --m=128 --n=128 --k=192 --g=128 --mode=2 --iterations=0) # Final group has residue + +set(TEST_ALPHA_BETA --alpha=0.5 --beta=0.7 --mode=2 --iterations=0) # Alpha and Beta with default shapes + + +cutlass_example_add_executable( + 55_hopper_mixed_dtype_gemm + 55_hopper_mixed_dtype_gemm.cu + TEST_COMMAND_OPTIONS + TEST_DIRECT_BATCHED + TEST_SCALE_PERCOL + TEST_SCALE_ZERO_PERCOL + TEST_SCALE_GROUP + TEST_SCALE_ZERO_GROUPED + TEST_SCALE_RESIDUE + TEST_SCALE_ZERO_RESIDUE + # TEST_ALPHA_BETA + ) + +cutlass_example_add_executable( + 55_hopper_int4_fp8_gemm + 55_hopper_int4_fp8_gemm.cu + TEST_COMMAND_OPTIONS + TEST_DIRECT_BATCHED + TEST_SCALE_PERCOL + TEST_SCALE_GROUP + TEST_SCALE_RESIDUE + # TEST_ALPHA_BETA + ) + + cutlass_example_add_executable( + 55_hopper_int4_bf16_gemm + 55_hopper_int4_bf16_gemm.cu + TEST_COMMAND_OPTIONS + TEST_DIRECT_BATCHED + TEST_SCALE_PERCOL + TEST_SCALE_GROUP + TEST_SCALE_RESIDUE + # TEST_ALPHA_BETA + ) diff --git a/examples/55_hopper_mixed_dtype_gemm/README.md b/examples/55_hopper_mixed_dtype_gemm/README.md new file mode 100644 index 0000000000..ecb4f41c97 --- /dev/null +++ b/examples/55_hopper_mixed_dtype_gemm/README.md @@ -0,0 +1,44 @@ +This example shows how to do mixed types GEMMs in CUTLASS. + +## High level overview +This example shows how to perform GEMMs on Hopper when A and B have different types. This implementation always passes the type with fewer bits through the register file and upcasts to the type with the higher bit count. + +When relying on `KernelScheduleAuto`, the main loop supporting different A and B types will be selected whenever the bit count of A is not equal to the bit count of B. Users can manually select the mixed type main loop and explicitly choose the scheduling policy by specifying one of the following schedules to the `CollectiveBuilder`: `KernelTmaWarpSpecialized`, `KernelTmaWarpSpecializedPingpong` or `KernelTmaWarpSpecializedCooperative`. + +This first version only supports mixed type GEMMs using TMA. + +## Performance + +While the example offers a harness for straightforward benchmarking, this initial implementation isn't optimized for performance in the majority of scenarios. 
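As a reference for the schedule selection described in the overview above, the fragment below sketches how a mixed-dtype mainloop can be composed with the `CollectiveBuilder` when an explicit schedule is chosen. All type names, alignments and tile sizes in it (`MmaType`, `QuantType`, `ElementScale`, `ElementZero`, and so on) are illustrative assumptions, not the exact definitions used by `55_hopper_mixed_dtype_gemm.cu`.

```c++
// Minimal sketch, assuming illustrative type names; the example's own code additionally carves
// shared-memory stages out around its epilogue (StageCountAutoCarveout) and builds a full kernel.
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cute/tensor.hpp"

using MmaType   = cutlass::half_t;   // wide type, fed to the MMA
using QuantType = cutlass::int4b_t;  // narrow type, upcast through the register file

using ElementScale = cutlass::half_t;
using ElementZero  = cutlass::half_t;

// The narrow type is the B operand here, so A and B are swapped and their layouts transposed;
// after the swap the narrow operand occupies the first slot and is K-major.
using LayoutA_Transpose = cutlass::layout::ColumnMajor;
using LayoutB_Transpose = cutlass::layout::RowMajor;

constexpr int AlignmentA = 128 / cutlass::sizeof_bits<MmaType>::value;    // 8 elements
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<QuantType>::value;  // 32 elements

using TileShape    = cute::Shape<cute::_128, cute::_128, cute::_64>;
using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;

// One of the mixed-input schedules named above, chosen explicitly instead of KernelScheduleAuto.
using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecialized;

// A tuple of {narrow element, scale, zero} selects scale-with-zero-point mode;
// {narrow element, scale} selects scale-only; the narrow element alone selects direct conversion.
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
    cute::tuple<QuantType, ElementScale, ElementZero>, LayoutB_Transpose, AlignmentB,
    MmaType, LayoutA_Transpose, AlignmentA,
    float,                                     // accumulator type
    TileShape, ClusterShape,
    cutlass::gemm::collective::StageCountAuto,
    KernelSchedule
  >::CollectiveOp;
```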
We expect this implementation to be performant for `{fp16, bf16} x {int8, int4, int2}` and `{fp8} x {int4}` for problems that are compute bound. Additionally, we expect good performance for `fp16`, `bf16` or `fp32` scales and zero-points. For best performance, it is ideal to have the scales and zero-points be the same type as mma's type. + +The scale only mode for `fp8 x int4` is significantly slower than direct conversion mode. There is a lookup-table workaround targeting this mode, as shown in `55_hopper_int4_fp8_gemm.cu`. To use this feature, use `cutlass::Array` as the scale type in the collective builder. However, it requires modifications to the encoding of quantized weights and scale factors. Also, scale with zero point mode is not supported for now. + + +Additionally, it's recommended to reorder the narrow data type tensor such that elements read into register file by the same thread are contiguous in global and shared memory. The user can use the helper function `compute_memory_reordering_atom` and `reorder_tensor` to achieve this. See `55_hopper_int4_fp8_gemm.cu` and `55_hopper_int4_bf16_gemm.cu` for more details. + + +We are currently optimizing the following cases: +1. Memory bound cases for all types +2. `fp8 x {int2, uint2}` case + +## Limitations + +* The type that needs to be converted must go through the register file. This means that the collective will swap and transpose whenever the type with fewer bits is the B operand. The user must be aware of when these swaps happen. Note that TMA epilogues currently do not support *implicit* swap + transpose, so non-tma epilogues must be used in this case. We plan to relax this limitation in a future release. + +* The layout of the narrow type must be K-major. This means the following: + * Narrow type is the A operand: Must be Row-Major + * Narrow type is the B operand: Must be Column-Major + +* For 8-bit x 4-bit or 2-bit, both inputs must be K-major. + +* TMA requires an alignment of 128 bits. As a result, for a type with `B` bits, `B x TILE_K` must be a multiple of 128 bits. + +* The type of the scale and zero-point type must be two bytes or more. + +* The group size must be equal to gemm-k size (indicating a broadcast), or it must be a multiple of the threadblock-k size. + +## Upcoming features + +* Optimizations for memory bound cases. + +* Optimizations for scale and zero-point loading when the group size is not equal to the threadblock-k size. diff --git a/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp b/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp new file mode 100644 index 0000000000..55de3fabb3 --- /dev/null +++ b/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp @@ -0,0 +1,391 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/util/command_line.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cute/tensor.hpp" + +#include +#include +#include "helper.h" + +enum MixedDtypeGemmMode { + ConvertOnly, + ScaleOnly, + ScaleWithZeroPoint +}; + +/// Command line options parsing +struct MixedDtypeOptions { + + bool help = false; + + float alpha = 1.0f; + float beta = 0.0f; + int iterations = 1000; + int warmup = 1000; + int mode = 1; + int m = 5120, n = 4096, k = 4096; + int g = 128; + int l = 1; + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m); + cmd.get_cmd_line_argument("n", n); + cmd.get_cmd_line_argument("k", k); + cmd.get_cmd_line_argument("l", l); + cmd.get_cmd_line_argument("g", g); + cmd.get_cmd_line_argument("mode", mode); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("warmup", warmup); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "55_hopper_mixed_dtype_gemm\n\n" + << " Hopper Mixed Data Type GEMM using a Warp Specialized kernel.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= The number of independent gemm problems with mnk shape\n" + << " --g= The size of each group for the scales and zeros. To broadcast a vector of scales or zeros, set the group size to K.\n" + << " --mode= The mode to run the gemm. 
0 does (A @ B), 1 means A @ (scale * B), 2 means A @ (scale * B + zero-point).\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n" + << " --warmup= Number of warmup iterations to perform.\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "55_hopper_mixed_dtype_gemm" << " --m=1024 --n=512 --k=1024 -g=1024 --l=10 --alpha=2 --mode=2 --beta=0.707 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * m * n * k * l; + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; + +/// Result structure +struct MixedDtypeResult +{ + double avg_runtime_ms = 0.0; + double gflops = 0.0; + cutlass::Status status = cutlass::Status::kSuccess; + cudaError_t error = cudaSuccess; + bool passed = false; + +}; + +/// Profiling Loop +template +void mixed_dtype_profiling( + Gemm& gemm, + MixedDtypeOptions const& options, + MixedDtypeResult& result) { + + if (options.iterations <= 0) return; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + std::vector runtimes; + runtimes.reserve(options.iterations); + + for (int iter = 0; iter < options.warmup + options.iterations; ++iter) { + cudaEventRecord(start); + CUTLASS_CHECK(gemm.run()); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + if (iter >= options.warmup) { + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + runtimes.push_back(milliseconds); + } + } + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + // Compute average setup and runtime and GFLOPs. + result.avg_runtime_ms = std::accumulate(runtimes.begin(), runtimes.end(), 0.0f) / runtimes.size(); + result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); + + std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; + +} + +/// Helpers to initialize a block of device data +template +bool initialize_tensor( + cutlass::DeviceAllocation& block, + uint64_t seed = 2023) { + + double scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } + else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } + else if (bits_output == 16) { + scope_max = 5; + scope_min = -5; + } + else { + scope_max = 8; + scope_min = -8; + } + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, Element(scope_max), Element(scope_min)); + + return true; +} + +template +bool initialize_quant_tensor( + cutlass::DeviceAllocation& block, + uint64_t seed = 2023) { + + float scope_min = float(cutlass::platform::numeric_limits::lowest()); + float scope_max = float(cutlass::platform::numeric_limits::max()); + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, Element(scope_max), Element(scope_min)); + + return true; +} + +template +bool initialize_scale( + cutlass::DeviceAllocation& block, + MixedDtypeOptions const& options, + uint64_t seed = 2023) { + + if (options.mode == MixedDtypeGemmMode::ConvertOnly) { + // No scales, so just initialize with 1 so we can use the same kernel to dequantize the data. 
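+    // Note: the reference dequantizer below computes B_dq = scale * B + zero, so a scale of 1
+    // (together with a zero of 0 from initialize_zero) makes dequantization a plain type conversion.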
+ std::vector stage(block.size(), Element(1.0f)); + block.copy_from_host(stage.data()); + } + else { + float elt_max_f = float(cutlass::platform::numeric_limits::max()); + const float max_dequant_val = 4.f; + const float min_dequant_val = 0.5f; + + float scope_max(max_dequant_val / elt_max_f); + float scope_min(min_dequant_val / elt_max_f); + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, Element(scope_max), Element(scope_min)); + } + return true; +} + +template +bool initialize_zero( + cutlass::DeviceAllocation& block, + MixedDtypeOptions const& options, + uint64_t seed = 2023) { + + if (options.mode == MixedDtypeGemmMode::ScaleWithZeroPoint) { + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, Element(2.0f), Element(-2.0f)); + } else { + // No bias, so just initialize with 1 so we can use the same kernel to dequantize the data. + std::vector stage(block.size(), Element(0.0f)); + block.copy_from_host(stage.data()); + } + return true; +} + +/// Dequantize the weights for verification + +template +__global__ void dequantize_weight_kernel(DequantizedElement* dq_buffer, + QuantizedElement const* q_buffer, + OperandLayout const operand_layout, + ElementScale const* scale_buffer, + ElementZero const* zero_buffer, + ScaleBroadCastLayout const broadcasted_scale_layout, + ThrLayout thr_layout) { + using namespace cute; + + // Represent the full tensors to gmem elements. + // These are expected to have shape [MN, K, L] + cute::Tensor gmem_op_dq = cute::make_tensor(cute::make_gmem_ptr(dq_buffer), operand_layout); + auto init_quantized_iterator = [&]() { + if constexpr (cute::sizeof_bits_v >= 8) { + return cute::make_gmem_ptr(q_buffer); + } else { + return cute::subbyte_iterator(q_buffer); + } + }; + cute::Tensor gmem_op_q = cute::make_tensor(init_quantized_iterator(), operand_layout); + // While the scales are expected to have shape [MN, G, L] but with a stride to allow broadcasting + // It is expected that K % G == 0 + cute::Tensor gmem_scale_broadcasted = cute::make_tensor(make_gmem_ptr(scale_buffer), broadcasted_scale_layout); + cute::Tensor gmem_zero_broadcasted = cute::make_tensor(make_gmem_ptr(zero_buffer), broadcasted_scale_layout); + + // Assign 1 thread per element in the thread block + auto blk_shape = make_shape(size<0>(thr_layout), _1{}, _1{}); // + auto blk_coord = make_coord(_, blockIdx.x, blockIdx.y); // (MN, K, L) + + // Tile across the block + auto gOp_dq = cute::local_tile(gmem_op_dq, blk_shape, blk_coord); + auto gScale = cute::local_tile(gmem_scale_broadcasted, blk_shape, blk_coord); + auto gZero = cute::local_tile(gmem_zero_broadcasted, blk_shape, blk_coord); + auto gOp_q = cute::local_tile(gmem_op_q, blk_shape, blk_coord); + + auto tOpDq_gOpDq = cute::local_partition(gOp_dq, thr_layout, threadIdx.x); + auto tScale_gScale = cute::local_partition(gScale, thr_layout, threadIdx.x); + auto tZero_gZero = cute::local_partition(gZero, thr_layout, threadIdx.x); + auto tOpQ_gOpQ = cute::local_partition(gOp_q, thr_layout, threadIdx.x); + + // Make a fragment of registers to hold gmem loads + cute::Tensor rmem_op_q = cute::make_fragment_like(tOpQ_gOpQ(_, _, _, 0)); + cute::Tensor rmem_scale = cute::make_fragment_like(tScale_gScale(_, _, _, 0)); + cute::Tensor rmem_zero = cute::make_fragment_like(tZero_gZero(_, _, _, 0)); + cute::Tensor rmem_op_dq = cute::make_fragment_like(tOpDq_gOpDq(_, _, _, 0)); + cute::Tensor rmem_op_scaled = cute::make_fragment_like(rmem_op_dq); + cute::Tensor rmem_zero_buf = 
cute::make_fragment_like(rmem_zero); + + cute::Tensor pred_id = cute::make_identity_tensor(shape(operand_layout)); + auto pred_blk_tile = cute::local_tile(pred_id, blk_shape, blk_coord); + auto pred_thr_partition = cute::local_partition(pred_blk_tile, thr_layout, threadIdx.x); + + const auto num_iters = cute::size<3>(tOpDq_gOpDq); + + for (int ii = 0; ii < num_iters; ++ii) { + const auto thread_offset = cute::get<0>(pred_thr_partition(0, 0, 0, ii)); + if (thread_offset < cute::size<0>(operand_layout)) { + cute::copy(tOpQ_gOpQ(_, _, _, ii), rmem_op_q); + cute::copy(tScale_gScale(_, _, _, ii), rmem_scale); + cute::copy(tZero_gZero(_, _, _, ii), rmem_zero); + cute::transform(rmem_op_q, rmem_op_scaled, [] (const QuantizedElement& elt) { return ElementScale(elt); } ); + cute::transform(rmem_zero, rmem_zero_buf, [] (const ElementZero& elt) { return ElementScale(elt); } ); + cute::transform(rmem_op_scaled, rmem_scale, rmem_op_scaled, multiplies{}); + cute::transform(rmem_op_scaled, rmem_zero_buf, rmem_op_scaled, plus{}); + cute::transform(rmem_op_scaled, rmem_op_dq, [] (const ElementScale& elt) { return DequantizedElement(elt); } ); + cute::copy(rmem_op_dq, tOpDq_gOpDq(_, _, _, ii)); + } + } +} + +template +void dequantize_weight(DequantizedElement* dq_buffer, + QuantizedElement const* q_buffer, + OperandLayout const operand_layout, + ElementScale const* scale_buffer, + ElementZero const* zero_buffer, + ScaleLayout const scale_layout, + int const group_size) { + + using namespace cute; + + constexpr int tpb = 128; + auto thr_layout = make_layout(make_shape(Int{})); + + const auto num_rows = get<0>(shape(operand_layout)); + const auto gemm_k = get<1>(shape(operand_layout)); // [MN, K, L] + const auto batches = get<2>(shape(operand_layout)); // [MN, K, L] + const auto scale_k = get<1>(shape(scale_layout)); // [MN, Scale_K, L] + + if (num_rows != size<0>(scale_layout)) { + std::cerr << "Invalid first dimension for scales. Must match first dim for weights." + << " But got shapes " << shape(operand_layout) << " " << shape(scale_layout) + << std::endl; + exit(-1); + } + + const auto scale_stride0 = get<0>(stride(scale_layout)); + const auto scale_stride1 = get<1>(stride(scale_layout)); + const auto scale_stride2 = get<2>(stride(scale_layout)); + + auto scale_shape_bcast = make_shape(num_rows, make_shape(group_size, scale_k), batches); + auto scale_stride_bcast = make_stride(scale_stride0, make_stride(0, scale_stride1), scale_stride2); + auto scale_layout_bcast = make_layout(scale_shape_bcast, scale_stride_bcast); + + const auto blocks_x = gemm_k; + const auto blocks_y = batches; + + dim3 blocks(blocks_x, blocks_y, 1); + dequantize_weight_kernel<<>>(dq_buffer, q_buffer, operand_layout, scale_buffer, zero_buffer, scale_layout_bcast, thr_layout); + CUDA_CHECK(cudaDeviceSynchronize()); +} diff --git a/examples/55_hopper_mixed_dtype_gemm/packed_scale.hpp b/examples/55_hopper_mixed_dtype_gemm/packed_scale.hpp new file mode 100644 index 0000000000..bd71e9cf28 --- /dev/null +++ b/examples/55_hopper_mixed_dtype_gemm/packed_scale.hpp @@ -0,0 +1,212 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include + + +#include "cutlass/util/device_memory.h" +#include "cutlass/integer_subbyte.h" +#include "cutlass/float8.h" +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "cute/tensor.hpp" +#include "cute/util/type_traits.hpp" + +namespace cutlass +{ +template +class packed_scale_t { +public: + static_assert(cute::is_same_v || + cute::is_same_v || + cute::is_same_v || + cute::is_same_v, + "only 8 bit arithmetic types are supported."); + CUTLASS_HOST_DEVICE + explicit packed_scale_t(T val) { + if constexpr (!cute::is_unsigned_v) { + // Only pack negative values. The positive values are generated in flight in the mainloop. 
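+    // The two 32-bit words form an 8-entry byte lookup table {-8*val, -7*val, ..., -1*val};
+    // the mainloop's PRMT picks one byte per 4-bit value and applies the sign with a second PRMT.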
+ storage[0] = pack4(T(float(val) * -8.f), T(float(val) * -7.f), T(float(val) * -6.f), T(float(val) * -5.f)); + storage[1] = pack4(T(float(val) * -4.f), T(float(val) * -3.f), T(float(val) * -2.f), -val); + } + else { + storage[0] = pack4(T(float(val) * 8.f), T(float(val) * 7.f), T(float(val) * 6.f), T(float(val) * 5.f)); + storage[1] = pack4(T(float(val) * 4.f), T(float(val) * 3.f), T(float(val) * 2.f), val); + } + } + CUTLASS_HOST_DEVICE + packed_scale_t() = default; + CUTLASS_HOST_DEVICE + explicit operator float() const { + return float(get()); + } + CUTLASS_HOST_DEVICE + bool operator==(packed_scale_t const& rhs) const { + return storage[0] == rhs.storage[0] && storage[1] == rhs.storage[1]; + } + CUTLASS_HOST_DEVICE + bool operator!=(packed_scale_t const& rhs) const { + return !(*this == rhs); + } + CUTLASS_HOST_DEVICE + friend packed_scale_t operator+(packed_scale_t const& lhs, packed_scale_t const& rhs) { + return packed_scale_t(lhs.get() + rhs.get()); + } + CUTLASS_HOST_DEVICE + friend packed_scale_t operator-(packed_scale_t const& lhs, packed_scale_t const& rhs) { + return packed_scale_t(lhs.get() - rhs.get()); + } + CUTLASS_HOST_DEVICE + friend packed_scale_t operator*(packed_scale_t const& lhs, packed_scale_t const& rhs) { + return packed_scale_t(lhs.get() * rhs.get()); + } + CUTLASS_HOST_DEVICE + friend packed_scale_t operator/(packed_scale_t const& lhs, packed_scale_t const& rhs) { + return packed_scale_t(lhs.get() / rhs.get()); + } + +private: + using Storage = uint32_t; + using Stage = uint8_t; + + Storage storage[2] {}; + + CUTLASS_HOST_DEVICE + static Storage pack4(T c1, T c2, T c3, T c4) { + Storage result = 0; + result |= (static_cast(reinterpret_cast(c4)) << 24); + result |= (static_cast(reinterpret_cast(c3)) << 16); + result |= (static_cast(reinterpret_cast(c2)) << 8); + result |= static_cast(reinterpret_cast(c1)); + return result; + } + CUTLASS_HOST_DEVICE + T get() const { + auto stage = static_cast(storage[0] >> 8); + #if defined(__CUDA_ARCH__) + return reinterpret_cast(stage); + #else + T tmp; + std::memcpy(&tmp, &stage, sizeof(Stage)); + return tmp; + #endif + } + CUTLASS_HOST_DEVICE + T get(int idx) const { + Stage stage; + if (idx < 4) stage = static_cast(storage[0] >> (8 * idx)); + else stage = static_cast(storage[1] >> (8 * idx - 32)); + #if defined(__CUDA_ARCH__) + return reinterpret_cast(stage); + #else + T tmp; + std::memcpy(&tmp, &stage, sizeof(Stage)); + return tmp; + #endif + } +}; +} + +/// Helpers to initialize scale lookup table + +// In the mainloop, PRMT selects 1 byte from only 8 bytes so the sign bit is handled in an extra PRMT. +// Here the encodings of positive values and negative values are unified (except for the sign bit). +// For instance, 1 becomes 0b0111, which is the same encoding as -1 (0b1111). 
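+// Concretely, a positive magnitude k in [1, 7] is re-encoded as (8 - k) with the sign bit clear,
+// so it shares its low three bits with the two's-complement encoding of -k
+// (e.g. +3 = 0b0011 becomes 0b0101, matching the low bits of -3 = 0b1101).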
+bool unify_quant_encoding( + cutlass::DeviceAllocation const& block_in, + cutlass::DeviceAllocation& block_out) { + + using StorageType = cutlass::int4b_t::Storage; + + if (block_in.size() != block_out.size()) { + std::cerr << "block_in and block_out must have same size.\n"; + return false; + } + constexpr int pack = cute::sizeof_bits_v / 4; + std::vector data(block_in.size() / pack); + cutlass::device_memory::copy_to_host(data.data(), (StorageType*)block_in.get(), block_in.size() / pack); + + for (auto&& d : data) { + StorageType out = 0; + StorageType mask = 0x0f; + for (int i = 0; i < pack; ++i) { + cutlass::int4b_t curr; + curr.storage = (d >> (i * 4)) & 0x0f; + switch (curr) { + case 1: curr.storage = StorageType(0b0111); break; // 2's complement + case 2: curr.storage = StorageType(0b0110); break; // 2's complement + case 3: curr.storage = StorageType(0b0101); break; // 2's complement + case 4: curr.storage = StorageType(0b0100); break; // 2's complement + case 5: curr.storage = StorageType(0b0011); break; // 2's complement + case 6: curr.storage = StorageType(0b0010); break; // 2's complement + case 7: curr.storage = StorageType(0b0001); break; // 2's complement + default: break; + } + out |= (curr.storage << (4 * i)) & mask; + mask <<= 4; + } + d = out; + } + + cutlass::device_memory::copy_to_device((StorageType*)block_out.get(), data.data(), block_out.size() / pack); + return true; +} + +template +bool initialize_packed_scale( + cutlass::DeviceAllocation const& block_in, + cutlass::DeviceAllocation > & block_out) { + + std::vector data_in(block_in.size()); + std::vector > data_out(block_in.size()); + try { + block_in.copy_to_host(data_in.data()); + } catch (cutlass::cuda_exception const& e) + { + std::cerr << "CUDA Error: " << cudaGetErrorString(e.cudaError()) << std::endl; + return false; + } + for (size_t i = 0; i < block_in.size(); ++i) + { + cutlass::packed_scale_t tmp(data_in[i]); + data_out[i] = reinterpret_cast const&>(tmp); + } + try { + block_out.copy_from_host(data_out.data()); + } catch (cutlass::cuda_exception const& e) + { + std::cerr << "CUDA Error: " << cudaGetErrorString(e.cudaError()) << std::endl; + return false; + } + return true; +} diff --git a/examples/55_hopper_mixed_dtype_gemm/reorder_utils.hpp b/examples/55_hopper_mixed_dtype_gemm/reorder_utils.hpp new file mode 100644 index 0000000000..de5a3d3fd0 --- /dev/null +++ b/examples/55_hopper_mixed_dtype_gemm/reorder_utils.hpp @@ -0,0 +1,162 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "cute/layout.hpp" +#include "cute/tensor.hpp" +#include "cute/arch/mma_sm90.hpp" + +#include "cutlass/util/device_memory.h" + +// Given a type of MMA instruction, compute a memory reordering atom that places all values +// owned by each thread in contiguous memory locations. This improves smem load vectorization, +// particularly for mixed dtype GEMMs where a narrow type is loaded in the thread/value order +// of the wider type and may result in inefficient sub-bank (8-bit or 16-bit) accesses. +// In addition, we can reorder the values across several MMA instructions to get even wider +// vectorization (AtomLayout parameter) and permute the values within each instruction to get +// more optimal conversion instruction sequences (ValLayout parameter). +template, + class ValLayout = cute::Layout> +constexpr auto compute_memory_reordering_atom(AtomLayout atom_layout = {}, ValLayout val_layout = {}) +{ + using namespace cute; + + static_assert(is_static_v, "ValLayout must be static"); + static_assert(is_static_v, "AtomLayout must be static"); + + // 1. Choose an MMA atom to access TV layout and MN shape + // Note: parameters like GMMA Major, TileShape, ElementC don't affect TV layout of A, use arbitrary + using MmaAtom = decltype(SM90::GMMA::rs_op_selector>()); + using MmaTraits = MMA_Traits; + auto mk_shape_mma = select<0,2>(typename MmaTraits::Shape_MNK{}); + auto tv_layout_mma = typename MmaTraits::ALayout{}; + static_assert(size<1>(tv_layout_mma) % size(val_layout) == 0, "Value layout must evenly divide the MMA value layout"); + + // 2. Create a single warp's TV layout from that of the whole MMA and invert to get (m,k -> thr,val) + // Note: this assumes A is partitioned between warps along M mode + auto tv_tiler_warp = make_shape(Int<32>{}, size<1>(tv_layout_mma)); + auto mk_shape_warp = shape_div(mk_shape_mma, size(typename MmaTraits::ThrID{}) / Int<32>{}); + auto tv_layout_mma_warp = make_layout_like(composition(tv_layout_mma, tv_tiler_warp)); + auto mk_layout_mma_warp = right_inverse(tv_layout_mma_warp).with_shape(mk_shape_warp); + + // 3. Repeat the warp layout NumAtoms times along K mode to get wider vectorization + auto mk_layout_mma_trgt = blocked_product(mk_layout_mma_warp, atom_layout); + + // 4. 
Compose with a contiguous layout of values in each thread (required for smem vectorization) + auto val_to_offset = logical_product(val_layout, size<1>(tv_layout_mma) / size(val_layout) * size(atom_layout)); + auto thr_to_offset = make_layout(size<0>(tv_layout_mma_warp)); + auto tv_to_offset = select<1,0>(logical_product(val_to_offset, thr_to_offset)); + auto layout_atom = composition(tv_to_offset, mk_layout_mma_trgt); + + return layout_atom; +} + +template +__global__ void reorder_tensor_kernel( + cute::Tensor S, + cute::Tensor D, + TiledCopy tiled_copy) +{ + using namespace cute; + + using T = typename EngineDst::value_type; + + Tensor gS = local_tile(S, TileShape{}, make_coord(blockIdx.x, _, blockIdx.z)); + Tensor gD = local_tile(D, TileShape{}, make_coord(blockIdx.x, _, blockIdx.z)); + + auto thread_copy = tiled_copy.get_slice(threadIdx.x); + Tensor tS = thread_copy.partition_S(gS); + Tensor tD = thread_copy.partition_D(gD); + + copy(tiled_copy, tS, tD); +} + +template +void reorder_tensor( + cute::Tensor S, + cute::Tensor D) +{ + using namespace cute; + + using T = typename EngineDst::value_type; + static_assert(is_same_v, T>, "Type mismatch"); + + // Construct a value layout that assigns at least 8 bits of contiguous elements in destination tensor to a thread + // This avoids a race condition when writing out subbyte types (e.g. int4b_t). + auto has_major_mode = [](auto s) { + return any_of(s, [](auto a){ return is_constant<1, decltype(a)>{}; }); + }; + static_assert(has_major_mode(stride<0>(LayoutDst{})) ^ has_major_mode(stride<1>(LayoutDst{})), + "Could not find stride-1 mode in destination layout"); + constexpr int N = shape_div(Int<8>{}, sizeof_bits{}); + auto val_layout = conditional_return(LayoutDst{}))>( + make_layout(make_shape(Int{}, Int<1>{}), GenColMajor{}), + make_layout(make_shape(Int<1>{}, Int{}), GenRowMajor{})); + + // Make a tiled copy with a simple row-major thread order and above layout + int constexpr NumThreads = 128; + auto const thr_layout = make_layout(make_shape(Int<1>{}, Int{})); + auto tiled_copy = make_tiled_copy(Copy_Atom{}, thr_layout, val_layout); + + // Assign a group of 16 rows to a threadblock; this matches the shuffle atom size for Hopper + using TileShape = Shape<_16>; + auto tiled_D = group_modes<3,rank_v>(tiled_divide(D, TileShape{})); + dim3 blocks{unsigned(size<1>(tiled_D)), 1u, unsigned(size<3>(tiled_D))}; + + reorder_tensor_kernel<<>>(S, D, tiled_copy); + CUDA_CHECK(cudaDeviceSynchronize()); +} + +// In-place version +template +void reorder_tensor( + T const* src, + LayoutSrc const& layout_src, + T * dst, + LayoutDst const& layout_dst) +{ + using namespace cute; + reorder_tensor(make_tensor(make_gmem_ptr(src), layout_src), + make_tensor(make_gmem_ptr(dst), layout_dst)); +} + +// In-place version +template +void reorder_tensor( + T * data, + LayoutSrc const& layout_src, + LayoutDst const& layout_dst) +{ + using namespace cute; + cutlass::DeviceAllocation temp(size(layout_src)); + reorder_tensor(data, layout_src, temp.get(), layout_dst); + cutlass::device_memory::copy_device_to_device(data, temp.get(), static_cast(size(layout_src))); +} diff --git a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu new file mode 100644 index 0000000000..51ce970dbd --- /dev/null +++ b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu @@ -0,0 +1,550 @@ 
+/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Hopper Ptr-Array Batched GEMM example using CUTLASS 3 APIs for NVIDIA Hopper architecture. + + This example demonstrates an implementation of Ptr-Array Batched GEMM using a TMA + GMMA + warp-specialized cooperative kernel. + The new feature showcased in this example is on-the-fly modification of TMA descriptors + to move between batches (represented by l). 
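+    Rather than a single base pointer plus a batch stride, each batch's A/B/C/D operands are
+    addressed through device arrays of pointers (one entry per batch), and the GEMM runs in
+    cutlass::gemm::GemmUniversalMode::kArray.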
+ + To run this example: + + $ ./examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm --m=2048 --n=2048 --k=2048 --l=10 +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/group_array_problem_shape.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "helper.h" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// + +// A matrix configuration +using ElementA = cutlass::half_t; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = cutlass::half_t; // Element type for B matrix operand +using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// C/D matrix configuration +using ElementC = cutlass::half_t; // Element type for C and D matrix operands +using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes) + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized based on the tile size + +// Different configs for pingpong/cooperative +struct CooperativeConfig { + using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative; + using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative; + using TileShape = Shape<_256,_128,_64>; + using ClusterShape = Shape<_1,_2,_1>; +}; + +struct PingpongConfig { + using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpong; + using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = Shape<_64,_128,_64>; + using ClusterShape = Shape<_1,_1,_1>; +}; + +template +struct GemmGivenSchedule { + using TileShape = typename ScheduleConfig::TileShape; // 
Threadblock-level tile size + using ClusterShape = typename ScheduleConfig::ClusterShape; // Shape of the threadblocks in a cluster + using KernelSchedule = typename ScheduleConfig::KernelSchedule; // Kernel to launch + using EpilogueSchedule = typename ScheduleConfig::EpilogueSchedule; // Epilogue to launch + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, + ElementC, LayoutC, AlignmentC, + ElementC, LayoutC, AlignmentC, + EpilogueSchedule + >::CollectiveOp; + + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementA, LayoutA, AlignmentA, + ElementB, LayoutB, AlignmentB, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule + >::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + cutlass::gemm::ArrayProblemShape>, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; +}; + +using GemmKernel = GemmGivenSchedule::GemmKernel; +using Gemm = GemmGivenSchedule::Gemm; + +using GemmKernelPingpong = GemmGivenSchedule::GemmKernel; +using GemmPingpong = GemmGivenSchedule::Gemm; + + +// Reference device GEMM implementation type +using DeviceGemmReference = cutlass::reference::device::Gemm< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + ElementAccumulator>; + +using StrideA = typename Gemm::GemmKernel::StrideA; +using StrideB = typename Gemm::GemmKernel::StrideB; +using StrideC = typename Gemm::GemmKernel::StrideC; +using StrideD = typename Gemm::GemmKernel::StrideD; + +StrideA stride_A; +StrideB stride_B; +StrideC stride_C; +StrideD stride_D; +uint64_t seed; + +std::vector offset_A; +std::vector offset_B; +std::vector offset_C; +std::vector offset_D; + +cutlass::DeviceAllocation block_A; +cutlass::DeviceAllocation block_B; +cutlass::DeviceAllocation block_C; +cutlass::DeviceAllocation block_D; +cutlass::DeviceAllocation block_ref_D; + +cutlass::DeviceAllocation ptr_A; +cutlass::DeviceAllocation ptr_B; +cutlass::DeviceAllocation ptr_C; +cutlass::DeviceAllocation ptr_D; +cutlass::DeviceAllocation ptr_ref_D; + +#endif // defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help = false; + + float alpha = 1.0f; + float beta = 0.0f; + int iterations = 10; + int m = 1024, n = 512, k = 1024, l = 10; + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m); + cmd.get_cmd_line_argument("n", n); + cmd.get_cmd_line_argument("k", k); + cmd.get_cmd_line_argument("l", l); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "56_hopper_ptr_array_batched_gemm\n\n" + << " Hopper FP32 GEMM using a Warp Specialized kernel.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the batch count for Ptr-Array GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Number of profiling iterations to perform\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "56_hopper_ptr_array_batched_gemm" << " --m=1024 --n=512 --k=1024 --l=10 --alpha=2 --beta=0.707 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * m * n * k * l; + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; + +/// Result structure +struct Result +{ + double avg_runtime_ms = 0.0; + double gflops = 0.0; + cutlass::Status status = cutlass::Status::kSuccess; + cudaError_t error = cudaSuccess; + bool passed = false; +}; + +#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::DeviceAllocation& block, + uint64_t seed=2023) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = static_cast(2); + scope_min = static_cast(0); + } else if (bits_input <= 8) { + scope_max = static_cast(2); + scope_min = static_cast(-2); + } else { + scope_max = static_cast(8); + scope_min = static_cast(-8); + } + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, scope_max, scope_min, 0); + + return true; +} + +/// Allocates device-side data +void allocate(const Options &options) { + int64_t total_elements_A = 0; + int64_t total_elements_B = 0; + int64_t total_elements_C = 0; + int64_t total_elements_D = 0; + + for (int32_t i = 0; i < options.l; ++i) { + + offset_A.push_back(total_elements_A); + offset_B.push_back(total_elements_B); + offset_C.push_back(total_elements_C); + offset_D.push_back(total_elements_D); + + int64_t elements_A = options.m * options.k; + int64_t elements_B = options.k * options.n; + int64_t elements_C = options.m * options.n; + int64_t elements_D = options.m * options.n; + + total_elements_A += elements_A; + total_elements_B += elements_B; + total_elements_C += elements_C; + total_elements_D += elements_D; + } + + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + block_ref_D.reset(total_elements_D); +} + +/// Initialize operands to be used in the GEMM and reference GEMM +void initialize(const Options &options) { + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, options.l)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, 
cute::make_shape(options.m, options.n, options.l)); + + // + // Assign pointers + // + + std::vector ptr_A_host(options.l); + std::vector ptr_B_host(options.l); + std::vector ptr_C_host(options.l); + std::vector ptr_D_host(options.l); + + for (int32_t i = 0; i < options.l; ++i) { + ptr_A_host.at(i) = block_A.get() + offset_A.at(i); + ptr_B_host.at(i) = block_B.get() + offset_B.at(i); + ptr_C_host.at(i) = block_C.get() + offset_C.at(i); + ptr_D_host.at(i) = block_D.get() + offset_D.at(i); + } + + ptr_A.reset(options.l); + ptr_A.copy_from_host(ptr_A_host.data()); + + ptr_B.reset(options.l); + ptr_B.copy_from_host(ptr_B_host.data()); + + ptr_C.reset(options.l); + ptr_C.copy_from_host(ptr_C_host.data()); + + ptr_D.reset(options.l); + ptr_D.copy_from_host(ptr_D_host.data()); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); +} + +/// Populates a Gemm::Arguments structure from the given commandline options +template +typename GemmT::Arguments args_from_options(const Options &options) +{ + cutlass::KernelHardwareInfo hw_info; + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + typename GemmT::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kArray, + {{options.m, options.n, options.k, options.l}}, + {ptr_A.get(), stride_A, ptr_B.get(), stride_B}, + {{options.alpha, options.beta}, ptr_C.get(), stride_C, ptr_D.get(), stride_D}, + hw_info + }; + + return arguments; +} + +bool verify(const Options &options) { + bool passed = true; + for (int32_t i = 0; i < options.l; ++i) { + cutlass::TensorRef ref_A(block_A.get() + offset_A.at(i), Gemm::LayoutA::packed({options.m, options.k})); + cutlass::TensorRef ref_B(block_B.get() + offset_B.at(i), Gemm::LayoutB::packed({options.k, options.n})); + cutlass::TensorRef ref_C(block_C.get() + offset_C.at(i), Gemm::LayoutC::packed({options.m, options.n})); + cutlass::TensorRef ref_D(block_ref_D.get() + offset_D.at(i), Gemm::LayoutD::packed({options.m, options.n})); + + // + // Compute reference output + // + + // Create instantiation for device reference gemm kernel + DeviceGemmReference gemm_reference; + + // Launch device reference gemm kernel + gemm_reference( + {options.m, options.n, options.k}, + ElementAccumulator(options.alpha), + ref_A, + ref_B, + ElementAccumulator(options.beta), + ref_C, + ref_D); + + // Wait for kernel to finish + CUDA_CHECK(cudaDeviceSynchronize()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + passed &= cutlass::reference::device::BlockCompareEqual(block_ref_D.get() + offset_D.at(i), block_D.get() + offset_D.at(i), options.m * options.n); + } + return passed; +} + +/// Execute a given example GEMM computation +template +int run(Options &options) +{ + allocate(options); + initialize(options); + + // Instantiate CUTLASS kernel depending on templates + GemmT gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm + auto arguments = args_from_options(options); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = GemmT::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check if the problem size is 
supported or not
+  CUTLASS_CHECK(gemm.can_implement(arguments));
+
+  // Initialize CUTLASS kernel with arguments and workspace pointer
+  CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+
+  // Correctness / Warmup iteration
+  CUTLASS_CHECK(gemm.run());
+
+  // Check if output from CUTLASS kernel and reference kernel are equal or not
+  Result result;
+  result.passed = verify(options);
+
+  std::cout << "  Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
+
+  if (!result.passed) {
+    exit(-1);
+  }
+
+  // Run profiling loop
+  if (options.iterations > 0)
+  {
+    GpuTimer timer;
+    timer.start();
+    for (int iter = 0; iter < options.iterations; ++iter) {
+      CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+      CUTLASS_CHECK(gemm.run());
+    }
+    timer.stop();
+
+    // Compute average setup and runtime and GFLOPs.
+    float elapsed_ms = timer.elapsed_millis();
+    result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
+    result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
+
+    std::cout << "  Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
+    std::cout << "  Batches     : " << options.l << std::endl;
+    std::cout << "  Alpha, Beta : " << options.alpha << ',' << options.beta << std::endl;
+    std::cout << "  Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl;
+    std::cout << "  GFLOPS      : " << result.gflops << std::endl;
+  }
+
+  return 0;
+}
+
+#endif // defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+  // CUTLASS must be compiled with CUDA 12.3 Toolkit to run this example
+  if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 3)) {
+    std::cerr << "This example requires CUDA 12.3 or newer.\n";
+    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+    return 0;
+  }
+
+  cudaDeviceProp props;
+  int current_device_id;
+  CUDA_CHECK(cudaGetDevice(&current_device_id));
+  CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (props.major < 9) {
+    std::cerr
+      << "This example requires a GPU of NVIDIA's Hopper Architecture or "
+      << "later (compute capability 90 or greater).\n";
+    return 0;
+  }
+  //
+  // Parse options
+  //
+
+  Options options;
+
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  //
+  // Evaluate CUTLASS kernels
+  //
+
+#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED)
+  std::cout << "\n*** Cooperative schedule ***" << std::endl;
+  run<Gemm>(options);
+  std::cout << "\n*** Pingpong schedule ***" << std::endl;
+  run<GemmPingpong>(options);
+#endif
+
+  return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/examples/56_hopper_ptr_array_batched_gemm/CMakeLists.txt b/examples/56_hopper_ptr_array_batched_gemm/CMakeLists.txt
new file mode 100644
index 0000000000..1f59ceb8a1
--- /dev/null
+++ b/examples/56_hopper_ptr_array_batched_gemm/CMakeLists.txt
@@ -0,0 +1,54 @@
+
+# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +set(TEST_SQUARE --m=2048 --n=2048 --k=2048 -l=10 --iterations=1) # Square problem sizes +set(TEST_SQUARE_LARGE_BATCH --m=2048 --n=2048 --k=2048 -l=500 --iterations=1) # Square problem sizes + +set(TEST_EPILOGUE --alpha=0.5 --beta=0.5 --iterations=1) # Default problem sizes +set(TEST_EPILOGUE_LARGE_BATCH --alpha=1.5 --beta=2.0 -l=500 --iterations=1) # Default problem sizes + +set(TEST_EPILOGUE_OP --beta=0.5 --iterations=1) # Default problem sizes w/ Epilogue Op test +set(TEST_EPILOGUE_OP_LARGE_BATCH --alpha=1.5 -l=500 --iterations=1) # Default problem sizes w/ Epilogue Op test + +set(TEST_SMALLK --m=2048 --n=5120 --k=128 --l=5 --iterations=1) # Small-k problem sizes +set(TEST_SMALLK_LARGE_BATCH --m=1024 --n=512 --k=64 --l=500 --iterations=1) # Small-k problem sizes + +cutlass_example_add_executable( + 56_hopper_ptr_array_batched_gemm + 56_hopper_ptr_array_batched_gemm.cu + TEST_COMMAND_OPTIONS + TEST_SQUARE + TEST_SQUARE_LARGE_BATCH + TEST_EPILOGUE + TEST_EPILOGUE_LARGE_BATCH + TEST_EPILOGUE_OP + TEST_EPILOGUE_OP_LARGE_BATCH + TEST_SMALLK + TEST_SMALLK_LARGE_BATCH + ) diff --git a/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu new file mode 100644 index 0000000000..7b20a33548 --- /dev/null +++ b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu @@ -0,0 +1,772 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Hopper Grouped GEMM example using CUTLASS 3 APIs for NVIDIA Hopper architecture.
+
+    This example demonstrates an implementation of Grouped GEMM using a TMA + GMMA
+    warp-specialized cooperative kernel.
+    For this example all scheduling work is performed on the device.
+    The new feature showcased in this example is on-the-fly modification of TMA descriptors
+    to move between groups/problem_count (represented by groups).
+
+    To run this example:
+
+      $ ./examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm --m=2048 --n=2048 --k=2048 --groups=10
+
+      The above command sizes all 10 groups to the given m, n, and k extents.
+      Omitting any of the problem dimensions randomizes that dimension across the different groups.
+      The same applies to alpha and beta, which are randomized per group when not specified.
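      (Editorial note, not part of the original diff: as a hypothetical illustration, an invocation
      that fixes only M and K, such as

        $ ./examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm --m=2048 --k=512 --groups=16

      would give every group M=2048 and K=512 while N, alpha and beta are drawn per group.)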
+ + To run this example for a set of problems using the benchmark option: + + $ ./examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm --benchmark=./test_benchmark.txt + + Where the test_benchmark.txt may look as such: + 0 256x512x128 + 1 256x512x512 + 2 512x256x128 + 3 256x256x128 + 4 256x512x1024 + 5 1024x512x128 and so on +*/ + +#include +#include +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/group_array_problem_shape.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "helper.h" + +using namespace cute; +using ProblemShape = cutlass::gemm::GroupProblemShape>; // per group +using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand +using ElementB = cutlass::float_e5m2_t; // Element type for B matrix operand +using ElementC = cutlass::half_t; // Element type for C and D matrix operands + +#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// + +// A matrix configuration +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Alignment of B matrix in units of elements (up to 16 bytes) + +// C/D matrix configuration +using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Alignment of C matrix in units of elements (up to 16 bytes) + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized based on the tile size + +// Different configs for pingpong/cooperative +struct CooperativeConfig { + using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum; + using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative; + using TileShape = Shape<_256,_128,_128>; + using ClusterShape = Shape<_2,_2,_1>; +}; + +struct PingpongConfig { + using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using 
EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = Shape<_128,_128,_128>; + using ClusterShape = Shape<_2,_1,_1>; +}; + +template +struct GemmGivenSchedule { + using TileShape = typename ScheduleConfig::TileShape; // Threadblock-level tile size + using ClusterShape = typename ScheduleConfig::ClusterShape; // Shape of the threadblocks in a cluster + using KernelSchedule = typename ScheduleConfig::KernelSchedule; // Kernel to launch + using EpilogueSchedule = typename ScheduleConfig::EpilogueSchedule; // Epilogue to launch + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, + ElementC, LayoutC *, AlignmentC, + ElementC, LayoutC *, AlignmentC, + EpilogueSchedule, + cutlass::epilogue::fusion::LinearCombination + >::CollectiveOp; + +using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementA, LayoutA *, AlignmentA, + ElementB, LayoutB *, AlignmentB, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule + >::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; +}; + +using GemmKernel = GemmGivenSchedule::GemmKernel; +using Gemm = GemmGivenSchedule::Gemm; + +using GemmKernelPingpong = GemmGivenSchedule::GemmKernel; +using GemmPingpong = GemmGivenSchedule::Gemm; + +// Reference device GEMM implementation type +using DeviceGemmReference = cutlass::reference::device::Gemm< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + ElementAccumulator>; + +using StrideA = typename Gemm::GemmKernel::InternalStrideA; +using StrideB = typename Gemm::GemmKernel::InternalStrideB; +using StrideC = typename Gemm::GemmKernel::InternalStrideC; +using StrideD = typename Gemm::GemmKernel::InternalStrideD; + +// Host-side allocations +std::vector offset_A; +std::vector offset_B; +std::vector offset_C; +std::vector offset_D; + +std::vector stride_A_host; +std::vector stride_B_host; +std::vector stride_C_host; +std::vector stride_D_host; + +std::vector alpha_host; +std::vector beta_host; + +// Device-side allocations +cutlass::DeviceAllocation problem_sizes; + +cutlass::DeviceAllocation block_A; +cutlass::DeviceAllocation block_B; +cutlass::DeviceAllocation block_C; +cutlass::DeviceAllocation block_D; +cutlass::DeviceAllocation block_ref_D; + +cutlass::DeviceAllocation ptr_A; +cutlass::DeviceAllocation ptr_B; +cutlass::DeviceAllocation ptr_C; +cutlass::DeviceAllocation ptr_D; +cutlass::DeviceAllocation ptr_ref_D; + +cutlass::DeviceAllocation stride_A; +cutlass::DeviceAllocation stride_B; +cutlass::DeviceAllocation stride_C; +cutlass::DeviceAllocation stride_D; + +// Note, this is an array of pointers to alpha and beta scaling values per group +cutlass::DeviceAllocation alpha_device; +cutlass::DeviceAllocation beta_device; +cutlass::DeviceAllocation block_alpha; +cutlass::DeviceAllocation block_beta; + +#endif // defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help = false; + + float alpha = FLT_MAX; + float beta = FLT_MAX; + int iterations = 10; + int m = 1024, n = 2048, k = 512, groups = 10; + std::string benchmark_path; + std::vector problem_sizes_host; + int const tma_alignment_bits = 128; + int const alignment = tma_alignment_bits / cutlass::sizeof_bits::value; + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m); + cmd.get_cmd_line_argument("n", n); + cmd.get_cmd_line_argument("k", k); + cmd.get_cmd_line_argument("groups", groups); + cmd.get_cmd_line_argument("alpha", alpha, FLT_MAX); + cmd.get_cmd_line_argument("beta", beta, FLT_MAX); + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("benchmark", benchmark_path); + + // Decide how to initialize the problems + if (!benchmark_path.empty()) { + if (!benchmark_problems()) { + problem_sizes_host.clear(); + return; + } + } + else { + randomize_problems(cmd); + } + } + + void randomize_problems(cutlass::CommandLine &cmd) { + int cmd_line_m = -1, cmd_line_n = -1, cmd_line_k = -1; + cmd.get_cmd_line_argument("m", cmd_line_m); + cmd.get_cmd_line_argument("n", cmd_line_n); + cmd.get_cmd_line_argument("k", cmd_line_k); + + problem_sizes_host.reserve(groups); + + for (int i = groups; i > 0; i--) { + int m = cmd_line_m; + int n = cmd_line_n; + int k = cmd_line_k; + if (m < 1) { + m = alignment * ((rand() % 64) + 1); + } + if (n < 1) { + n = alignment * ((rand() % 64) + 1); + } + if (k < 1) { + k = alignment * ((rand() % 64) + 1); + } + problem_sizes_host.push_back({m, n, k}); + } + } + + /// Load a benchmark + bool benchmark_problems() { + std::ifstream file(benchmark_path); + if (!file.good()) { + return false; + } + + while (file.good()) { + + int idx = -1; + std::string extent_str; + + file >> idx >> extent_str; + + if (idx < 0 || extent_str.empty()) { + break; + } + + cutlass::gemm::GemmCoord extent; + std::vector tokens; + + cutlass::CommandLine::tokenize(tokens, extent_str, 'x'); + + for (int i = 0; i < int(tokens.size()); ++i) { + int x = std::atoi(tokens.at(i).c_str()); + + // round up + if (x % alignment) { + x += (alignment - (x % alignment)); + } + + extent.at(i) = x; + } + + if (extent.product()) { + problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); + } + } + groups = static_cast(problem_sizes_host.size()); + + return true; + } + + /// Prints the usage statement. 
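  // (Editorial note, not part of the original diff.) For the FP8 A operand used by this example,
  // alignment = tma_alignment_bits / sizeof_bits of ElementA = 128 / 8 = 16 elements, so
  // benchmark_problems() above rounds every extent up to a multiple of 16. For example:
  //
  //   x = 250  ->  250 % 16 = 10  ->  x += (16 - 10)  ->  x = 256
  //
  // and randomize_problems() only ever generates multiples of 16 in the first place.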
+ std::ostream & print_usage(std::ostream &out) const { + + out << "57_hopper_grouped_gemm\n\n" + << " Hopper FP8 Grouped GEMM using a Warp Specialized kernel.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM for all groups\n" + << " --n= Sets the N extent of the GEMM for all groups\n" + << " --k= Sets the K extent of the GEMM for all groups\n" + << " --groups= Sets the number of individual GEMM problems for Grouped GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Number of profiling iterations to perform\n\n" + << " --benchmark= Executes a benchmark problem size.\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "57_hopper_grouped_gemm" << " --m=1024 --n=512 --k=1024 --groups=10 --alpha=2 --beta=0.707 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s, std::vector problem_sizes_host) const + { + // Number of real-valued multiply-adds + uint64_t fmas = uint64_t(); + + for (auto const & problem : problem_sizes_host) { + fmas += static_cast(get<0>(problem)) * + static_cast(get<1>(problem)) * + static_cast(get<2>(problem)); + } + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * uint64_t(fmas); + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; + +/// Result structure +struct Result +{ + double avg_runtime_ms = 0.0; + double gflops = 0.0; + cutlass::Status status = cutlass::Status::kSuccess; + cudaError_t error = cudaSuccess; + bool passed = false; +}; + +#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::DeviceAllocation& block, + uint64_t seed=2023) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = static_cast(2); + scope_min = static_cast(0); + } else if (bits_input <= 8) { + scope_max = static_cast(2); + scope_min = static_cast(-2); + } else { + scope_max = static_cast(8); + scope_min = static_cast(-8); + } + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, scope_max, scope_min, 0); + + return true; +} + +/// Allocates device-side data +void allocate(const Options &options) { + int64_t total_elements_A = 0; + int64_t total_elements_B = 0; + int64_t total_elements_C = 0; + int64_t total_elements_D = 0; + + for (int32_t i = 0; i < options.groups; ++i) { + + auto problem = options.problem_sizes_host.at(i); + auto M = get<0>(problem); + auto N = get<1>(problem); + auto K = get<2>(problem); + + offset_A.push_back(total_elements_A); + offset_B.push_back(total_elements_B); + offset_C.push_back(total_elements_C); + offset_D.push_back(total_elements_D); + + int64_t elements_A = M * K; + int64_t elements_B = K * N; + int64_t elements_C = M * N; + int64_t elements_D = M * N; + + total_elements_A += elements_A; + total_elements_B += elements_B; + total_elements_C += elements_C; + total_elements_D += elements_D; + + stride_A_host.push_back(cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1})); + stride_B_host.push_back(cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1})); + 
stride_C_host.push_back(cutlass::make_cute_packed_stride(StrideC{}, {M, N, 1})); + stride_D_host.push_back(cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1})); + + } + + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + block_ref_D.reset(total_elements_D); + block_alpha.reset(options.groups); + block_beta.reset(options.groups); +} + +/// Initialize operands to be used in the GEMM and reference GEMM +void initialize(const Options &options) { + + uint64_t seed = 2020; + + problem_sizes.reset(options.groups); + problem_sizes.copy_from_host(options.problem_sizes_host.data()); + + // + // Assign pointers + // + + std::vector ptr_A_host(options.groups); + std::vector ptr_B_host(options.groups); + std::vector ptr_C_host(options.groups); + std::vector ptr_D_host(options.groups); + std::vector ptr_alpha_host(options.groups); + std::vector ptr_beta_host(options.groups); + + for (int32_t i = 0; i < options.groups; ++i) { + ptr_A_host.at(i) = block_A.get() + offset_A.at(i); + ptr_B_host.at(i) = block_B.get() + offset_B.at(i); + ptr_C_host.at(i) = block_C.get() + offset_C.at(i); + ptr_D_host.at(i) = block_D.get() + offset_D.at(i); + alpha_host.push_back((options.alpha == FLT_MAX) ? static_cast((rand() % 5) + 1) : options.alpha); + beta_host.push_back((options.beta == FLT_MAX) ? static_cast(rand() % 5) : options.beta); + ptr_alpha_host.at(i) = block_alpha.get() + i; + ptr_beta_host.at(i) = block_beta.get() + i; + } + + ptr_A.reset(options.groups); + ptr_A.copy_from_host(ptr_A_host.data()); + + ptr_B.reset(options.groups); + ptr_B.copy_from_host(ptr_B_host.data()); + + ptr_C.reset(options.groups); + ptr_C.copy_from_host(ptr_C_host.data()); + + ptr_D.reset(options.groups); + ptr_D.copy_from_host(ptr_D_host.data()); + + stride_A.reset(options.groups); + stride_A.copy_from_host(stride_A_host.data()); + + stride_B.reset(options.groups); + stride_B.copy_from_host(stride_B_host.data()); + + stride_C.reset(options.groups); + stride_C.copy_from_host(stride_C_host.data()); + + stride_D.reset(options.groups); + stride_D.copy_from_host(stride_D_host.data()); + + alpha_device.reset(options.groups); + alpha_device.copy_from_host(ptr_alpha_host.data()); + beta_device.reset(options.groups); + beta_device.copy_from_host(ptr_beta_host.data()); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + block_alpha.copy_from_host(alpha_host.data()); + block_beta.copy_from_host(beta_host.data()); +} + +/// Populates a Gemm::Arguments structure from the given commandline options +template +typename GemmT::Arguments args_from_options(const Options &options, bool host_problem_shapes_available = true) +{ + cutlass::KernelHardwareInfo hw_info; + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + typename GemmT::Arguments arguments; + decltype(arguments.epilogue.thread) fusion_args; + + if (options.alpha != FLT_MAX && options.beta != FLT_MAX) { + // If both alpha/beta are provided (via cmd line args) and are scalar, i.e., same alpha/beta applies to all batches. 
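  // (Editorial note, not part of the original diff.) In the two branches below, the last entry of
  // dAlpha/dBeta is the per-group stride of the scaling factors: a stride of 0 broadcasts the single
  // scalar to every group, while a stride of 1 steps through the alpha_ptr_array / beta_ptr_array
  // entries one element per group.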
+ fusion_args.alpha = options.alpha; + fusion_args.beta = options.beta; + fusion_args.alpha_ptr = nullptr; + fusion_args.beta_ptr = nullptr; + fusion_args.alpha_ptr_array = nullptr; + fusion_args.beta_ptr_array = nullptr; + // Single alpha and beta for all groups + fusion_args.dAlpha = {cute::_0{}, cute::_0{}, 0}; + fusion_args.dBeta = {cute::_0{}, cute::_0{}, 0}; + } + else { + // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups. + fusion_args.alpha = 0; + fusion_args.beta = 0; + fusion_args.alpha_ptr = nullptr; + fusion_args.beta_ptr = nullptr; + fusion_args.alpha_ptr_array = alpha_device.get(); + fusion_args.beta_ptr_array = beta_device.get(); + // One alpha and beta per each group + fusion_args.dAlpha = {cute::_0{}, cute::_0{}, 1}; + fusion_args.dBeta = {cute::_0{}, cute::_0{}, 1}; + } + + if (host_problem_shapes_available) { + arguments = typename GemmT::Arguments { + cutlass::gemm::GemmUniversalMode::kGrouped, + {options.groups, problem_sizes.get(), options.problem_sizes_host.data()}, + {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get()}, + {fusion_args, ptr_C.get(), stride_C.get(), ptr_D.get(), stride_D.get()}, + hw_info + }; + } + else { + arguments = typename GemmT::Arguments { + cutlass::gemm::GemmUniversalMode::kGrouped, + {options.groups, problem_sizes.get(), nullptr}, + {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get()}, + {fusion_args, ptr_C.get(), stride_C.get(), ptr_D.get(), stride_D.get()}, + hw_info + }; + } + + return arguments; +} + +bool verify(const Options &options) { + bool passed = true; + for (int32_t i = 0; i < options.groups; ++i) { + auto problem = options.problem_sizes_host.at(i); + auto M = get<0>(problem); + auto N = get<1>(problem); + auto K = get<2>(problem); + cutlass::TensorRef ref_A(block_A.get() + offset_A.at(i), Gemm::LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get() + offset_B.at(i), Gemm::LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get() + offset_C.at(i), Gemm::LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get() + offset_D.at(i), Gemm::LayoutD::packed({M, N})); + + // + // Compute reference output + // + + // Create instantiation for device reference gemm kernel + DeviceGemmReference gemm_reference; + + // Launch device reference gemm kernel + gemm_reference( + {M, N, K}, + ElementAccumulator(alpha_host.at(i)), + ref_A, + ref_B, + ElementAccumulator(beta_host.at(i)), + ref_C, + ref_D); + + // Wait for kernel to finish + CUDA_CHECK(cudaDeviceSynchronize()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + passed &= cutlass::reference::device::BlockCompareEqual(block_ref_D.get() + offset_D.at(i), block_D.get() + offset_D.at(i), M * N); + #if 0 + std::cout << "Group: " << i << " Status: " << passed << std::endl; + #endif + } + return passed; +} + +/// Execute a given example GEMM computation +template +int run(Options &options, bool host_problem_shapes_available = true) +{ + allocate(options); + initialize(options); + + // Instantiate CUTLASS kernel depending on templates + GemmT gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm + auto arguments = args_from_options(options, host_problem_shapes_available); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = GemmT::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); 
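  // (Editorial note, not part of the original diff.) The queried workspace size may be non-zero for
  // grouped GEMM; CUTLASS uses it for kernel-internal bookkeeping, so it is allocated up front here
  // and handed to initialize() below rather than skipped.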
+ + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(gemm.run()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + Result result; + result.passed = verify(options); + + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl; + + if (!result.passed) { + exit(-1); + } + + // Run profiling loop + if (options.iterations > 0) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + CUTLASS_CHECK(gemm.run()); + } + timer.stop(); + + // Compute average setup and runtime and GFLOPs. + float elapsed_ms = timer.elapsed_millis(); + result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); + + std::cout << " Problem Sizes, Alpha, Beta " << std::endl; + for (int32_t i = 0; i < options.groups; ++i) { + std::cout << " " << options.problem_sizes_host.at(i); + std::cout << ", " << alpha_host.at(i) << ", " << beta_host.at(i) << std::endl; + } + std::cout << " Groups : " << options.groups << std::endl; + std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS : " << result.gflops << std::endl; + } + + return 0; +} + +#endif // defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.3 Toolkit to run this example + if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 3)) { + std::cerr << "This example requires CUDA 12.3 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. 
+    return 0;
+  }
+
+  cudaDeviceProp props;
+  int current_device_id;
+  CUDA_CHECK(cudaGetDevice(&current_device_id));
+  CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (props.major < 9) {
+    std::cerr
+      << "This example requires a GPU of NVIDIA's Hopper Architecture or "
+      << "later (compute capability 90 or greater).\n";
+    return 0;
+  }
+  //
+  // Parse options
+  //
+
+  Options options;
+
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  //
+  // Evaluate CUTLASS kernels
+  //
+
+#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED)
+  std::cout << "\n*** Cooperative schedule ***" << std::endl;
+  run<Gemm>(options);
+  std::cout << "\n*** Cooperative schedule (host problem shapes unavailable) ***" << std::endl;
+  run<Gemm>(options, false /*host_problem_shapes_available*/);
+  std::cout << "\n*** Pingpong schedule ***" << std::endl;
+  run<GemmPingpong>(options);
+  std::cout << "\n*** Pingpong schedule (host problem shapes unavailable) ***" << std::endl;
+  run<GemmPingpong>(options, false /*host_problem_shapes_available*/);
+#endif
+
+  return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/57_hopper_grouped_gemm/CMakeLists.txt b/examples/57_hopper_grouped_gemm/CMakeLists.txt
new file mode 100644
index 0000000000..1dadbfa813
--- /dev/null
+++ b/examples/57_hopper_grouped_gemm/CMakeLists.txt
@@ -0,0 +1,66 @@
+# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Note that we set --iterations=0 for all tests below to disable the performance benchmarking.
+# Only the correctness check will be run by these commands.
+ +set(TEST_RANDOM --iterations=0) # Random problem sizes +set(TEST_RANDOM_LARGE_GROUP --groups=500 --iterations=0) # Random problem sizes + +set(TEST_EPILOGUE --alpha=0.5 --beta=0.5 --iterations=0) # Random problem sizes +set(TEST_EPILOGUE_LARGE_GROUP --alpha=1.5 --beta=2.0 --groups=500 --iterations=0) # Random problem sizes + +set(TEST_EPILOGUE_OP --beta=0.5 --iterations=1) # Random problem sizes +set(TEST_EPILOGUE_OP_LARGE_GROUP --alpha=1.5 --iterations=1) # Random problem sizes + +set(TEST_FIXED --m=2048 --n=5120 --k=8192 --groups=50 --iterations=0) # Fixed problem sizes +set(TEST_FIXED_LARGE_GROUP --m=2048 --n=512 --k=512 --groups=512 --iterations=0) # Fixed problem sizes + +set(TEST_SMALL --m=256 --n=128 --iterations=0) # Small problem sizes +set(TEST_SMALL_LARGE_GROUP --m=128 --n=128 --groups=500 --iterations=0) # Small problem sizes + +set(TEST_RANDOM_PERF --iterations=10) # Random problem sizes +set(TEST_RANDOM_PERF_LARGE_GROUP --groups=500 --iterations=10) # Random problem sizes + +cutlass_example_add_executable( + 57_hopper_grouped_gemm + 57_hopper_grouped_gemm.cu + TEST_COMMAND_OPTIONS + TEST_RANDOM + TEST_RANDOM_LARGE_GROUP + TEST_EPILOGUE + TEST_EPILOGUE_LARGE_GROUP + TEST_EPILOGUE_OP + TEST_EPILOGUE_OP_LARGE_GROUP + TEST_FIXED + TEST_FIXED_LARGE_GROUP + TEST_SMALL + TEST_SMALL_LARGE_GROUP + TEST_RANDOM_PERF + TEST_RANDOM_PERF_LARGE_GROUP + ) diff --git a/examples/58_ada_fp8_gemm/CMakeLists.txt b/examples/58_ada_fp8_gemm/CMakeLists.txt new file mode 100644 index 0000000000..2af325424f --- /dev/null +++ b/examples/58_ada_fp8_gemm/CMakeLists.txt @@ -0,0 +1,34 @@ + +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
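# (Editorial note, not part of the original diff.) Unlike examples 56 and 57 above, this target is
# added without TEST_COMMAND_OPTIONS; the TEST_* option sets passed for those examples each appear
# to define a separate test invocation of the executable, whereas this one is assumed to be run
# manually or with its default invocation only.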
+ + +cutlass_example_add_executable( + 58_ada_fp8_gemm + ada_fp8_gemm.cu + ) diff --git a/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu new file mode 100644 index 0000000000..79bead365b --- /dev/null +++ b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu @@ -0,0 +1,826 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Example of running an Ada FP8 GEMM. + + In addition to using FP8 Tensor Core instructions, the Ada FP8 GEMM uses a distinct epilogue + that enables additional scaling of operands/outputs, storing a pre-activation-function output + tensor (called the "auxiliary" output), and computing the absolute maximum value of the + outputs. 
+ + Pseudocode for this epilogue is as follows: + + Aux = ((alpha * scale_a * scale_b) * accumulator) + ((beta * scale_c) * source) + bias + D = activation(Aux) + + if Aux is fp8 type: + abs_max_output = max( abs(aux) | (for every aux in Aux)) + Aux = scale_aux * Aux + endif + + if D is fp8 type: + abs_max_output = max( abs(d) | (for every d in D)) + D = scale_d * D + endif + + Parameter Aux is optionally stored to global memory +*/ + +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm_complex.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic_with_scaling.h" +#include "cutlass/gemm/device/gemm_universal_with_absmax.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + + +using ElementA = cutlass::float_e4m3_t; +using ElementB = cutlass::float_e4m3_t; +using ElementOutput = cutlass::float_e4m3_t; +using ElementAuxOutput = ElementOutput; +using ElementAccumulator = float; +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using LayoutC = cutlass::layout::RowMajor; +static int const kStages = 3; +static int const kAlignmentA = 16; +static int const kAlignmentB = 16; + +using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombinationGenericWithScalingAndAbsMax< + cutlass::epilogue::thread::ReLu, + ElementOutput, + ElementAuxOutput, + 8, + ElementAccumulator, + ElementAccumulator + >; + +template +using Gemm_ = cutlass::gemm::device::GemmUniversalWithAbsMax< + ElementA, LayoutA, ElementB, LayoutB, ElementOutput, LayoutC, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm89, + cutlass::gemm::GemmShape<128, 64, 128>, cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + EpilogueOutputOp, cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages, + kAlignmentA, kAlignmentB, MathOperator + >; + +using ElementAbsmax = typename EpilogueOutputOp::ElementAbsmax; + + +// Command line options parsing +struct Options { + + bool help; + bool error; + bool reference_check; + cutlass::gemm::GemmCoord problem_size; + + int iterations; + int warmup_iterations; + + bool scale_A; + bool scale_B; + bool scale_C; + + float alpha; + float beta; + + Options(): + help(false), + error(false), + reference_check(false), + iterations(20), + warmup_iterations(5), + scale_A(true), + scale_B(true), + scale_C(true), + alpha(1.f), + beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("iterations", iterations, 20); + cmd.get_cmd_line_argument("warmup_iterations", warmup_iterations, 5); + cmd.get_cmd_line_argument("reference-check", reference_check, false); + cmd.get_cmd_line_argument("scale-A", scale_A, true); + cmd.get_cmd_line_argument("scale-B", scale_B, true); + 
cmd.get_cmd_line_argument("scale-C", scale_C, true); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + + int m, n, k; + cmd.get_cmd_line_argument("m", m, 1024); + cmd.get_cmd_line_argument("n", n, 1024); + cmd.get_cmd_line_argument("k", k, 1024); + + problem_size = cutlass::gemm::GemmCoord{m, n, k}; + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "58_ada_fp8_gemm\n\n" + << " This example executes a GEMM using Ada FP8 Tensor Core operations. In addition to performing\n" + << " a normal GEMM, the kernel performs the following operations:\n" + << " Aux = ((alpha * scale_a * scale_b) * accumulator) + ((beta * scale_c) * source) + bias\n" + << " D = activation(Aux)\n\n" + << " if Aux is fp8:\n" + << " abs_max_output = max( abs(aux) | (for every aux in Aux) )\n" + << " Aux = scale_aux * Aux\n\n" + << " if D is fp8 type:\n" + << " abs_max_output = max( abs(d) | (for every d in D) )\n" + << " D = scale_d * D\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M dimension of the GEMM\n" + << " --n= Sets the N dimension of the GEMM\n" + << " --k= Sets the K dimension of the GEMM\n" + << " --scale-A= Whether to apply a scaling factor to operand A (default: true)\n" + << " --scale-B= Whether to apply a scaling factor to operand B (default: true)\n" + << " --scale-C= Whether to apply a scaling factor to operand C (default: true)\n" + << " --iterations= Number of profiling iterations to perform\n" + << " --warmup-iterations= Number of warmup iterations to perform\n" + << " --reference-check= If true, performs reference check\n"; + + return out; + } + + /// Compute performance in GFLOP/s + float gflops(float runtime_s) const { + // Two flops per multiply-add + return 2.0f * float(problem_size.product()) / float(1.0e9) / runtime_s; + } +}; + +/// Helper class to run the kernel +template +struct TestbedRunner { + + using ElementAccumulator = typename Gemm::ElementAccumulator; + using ElementCompute = typename Gemm::GemmKernel::Epilogue::OutputOp::ElementCompute; + using ElementScalingFactor = typename Gemm::EpilogueOutputOp::ElementScalingFactor; + + static bool const kScaleAux = Gemm::EpilogueOutputOp::kIsScalingAndAmaxAuxOutputNeeded; + static bool const kScaleOutput = Gemm::EpilogueOutputOp::kIsScalingAndAmaxOutputNeeded; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A; + cutlass::HostTensor tensor_B; + cutlass::HostTensor tensor_C; + cutlass::HostTensor tensor_Aux; + cutlass::HostTensor tensor_D; + cutlass::HostTensor tensor_Vector; + cutlass::HostTensor tmp_D; + cutlass::HostTensor reference_D; + cutlass::HostTensor reference_Aux; + cutlass::HostTensor scale_A; + cutlass::HostTensor scale_B; + cutlass::HostTensor scale_C; + cutlass::HostTensor scale_D; + cutlass::HostTensor scale_Aux; + cutlass::HostTensor abs_max_Aux; + cutlass::HostTensor abs_max_D; + cutlass::HostTensor reference_abs_max_Aux; + cutlass::HostTensor reference_abs_max_D; + + // + // Methods + // + + TestbedRunner( + bool scaleA = true, + bool scaleB = true, + bool scaleC = true, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + 
init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + /// Helper to initialize scaling factors + template + bool initialize_scale_factor(cutlass::TensorView view, uint64_t seed, int bits=0) { + cutlass::reference::host::TensorFillRandomUniform(view, seed, double(1.), double(0.), bits); + return true; + } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + double scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else if (bits_output == 16) { + scope_max = 5; + scope_min = -5; + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope_max, scope_min, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + std::cerr << "Not implemented"; + return false; + } + + return true; + } + + /// Initializes data structures + void initialize(const Options& options) { + // + // Allocate the GEMM workspace + // + + tensor_A.resize(options.problem_size.mk()); + tensor_B.resize(options.problem_size.kn()); + tensor_C.resize(options.problem_size.mn()); + tensor_D.resize(options.problem_size.mn()); + tensor_Vector.resize({1, options.problem_size.n()}); + reference_D.resize(options.problem_size.mn(), false); + tmp_D.resize(options.problem_size.mn(), false); + + initialize_tensor(tensor_A.host_view(), init_A, seed + 2019); + initialize_tensor(tensor_B.host_view(), init_B, seed + 2018); + initialize_tensor(tensor_C.host_view(), init_C, seed + 2017); + initialize_tensor(tensor_Vector.host_view(), init_C, seed + 2020); + + // It is possible to randomly initialize to all zeros, so override this with non-zeros + // in the upper left corner of each operand. 
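    // (Editorial note, not part of the original diff.) Without this override, a random fill could
    // in principle leave an operand entirely zero, in which case the GEMM output would be zero
    // regardless of the kernel's correctness and the reference comparison would pass vacuously;
    // writing a known non-zero value into each operand guards against that.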
+ cutlass::Coord<2> origin(0); + tensor_A.host_view().at(origin) = typename Gemm::ElementA(1); + tensor_B.host_view().at(origin) = typename Gemm::ElementB(1); + tensor_C.host_view().at(origin) = typename Gemm::ElementC(1); + tensor_Vector.host_view().at(origin) = typename Gemm::ElementC(1); + + cutlass::reference::host::TensorFill(tensor_D.host_view()); + cutlass::reference::host::TensorCopy(reference_D.host_view(), tensor_C.host_view()); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_C.sync_device(); + tensor_D.sync_device(); + tensor_Vector.sync_device(); + + int scale_bits = 2; + if (options.scale_A) { + scale_A.resize({1, 1}); + initialize_scale_factor(scale_A.host_view(), seed + 2021, scale_bits); + scale_A.sync_device(); + } + + if (options.scale_B) { + scale_B.resize({1, 1}); + initialize_scale_factor(scale_B.host_view(), seed + 2022, scale_bits); + scale_B.sync_device(); + } + + if (options.scale_C) { + scale_C.resize({1, 1}); + initialize_scale_factor(scale_C.host_view(), seed + 2023, scale_bits); + scale_C.sync_device(); + } + + if (kScaleOutput) { + scale_D.resize({1, 1}); + initialize_scale_factor(scale_D.host_view(), seed + 2024, scale_bits); + scale_D.sync_device(); + + abs_max_D.resize({1, 1}); + cutlass::reference::host::TensorFill(abs_max_D.host_view()); + abs_max_D.sync_device(); + + reference_abs_max_D.resize({1, 1}); + } + + if (kScaleAux) { + tensor_Aux.resize(options.problem_size.mn()); + cutlass::reference::host::TensorFill(tensor_Aux.host_view()); + tensor_Aux.sync_device(); + + scale_Aux.resize({1, 1}); + initialize_scale_factor(scale_Aux.host_view(), seed + 2025, scale_bits); + scale_Aux.sync_device(); + + abs_max_Aux.resize({1, 1}); + cutlass::reference::host::TensorFill(abs_max_Aux.host_view()); + abs_max_Aux.sync_device(); + + reference_Aux.resize(options.problem_size.mn(), false); + reference_abs_max_Aux.resize({1, 1}); + } + } + + /// Compares computed reference with device reference and outputs to a file if incorrect + bool compare_reference(const Options& options) { + + tensor_D.sync_host(); + + bool passed = cutlass::reference::host::TensorEquals(reference_D.host_view(), tensor_D.host_view()); + + if (kScaleAux) { + tensor_Aux.sync_host(); + abs_max_Aux.sync_host(); + passed &= cutlass::reference::host::TensorEquals(reference_Aux.host_view(), tensor_Aux.host_view()); + passed &= cutlass::reference::host::TensorEquals(abs_max_Aux.host_view(), reference_abs_max_Aux.host_view()); + } + + if (kScaleOutput) { + abs_max_D.sync_host(); + passed &= cutlass::reference::host::TensorEquals(abs_max_D.host_view(), reference_abs_max_D.host_view()); + } + + if (!passed) { + std::cerr << "Reference check failed" << std::endl; + + std::string output_file = "testbed_with_amax_errors.txt"; + std::ofstream file(output_file); + + file + << "problem: " << options.problem_size + << ", alpha: " << options.alpha << ", beta: " << options.beta << "\n\n"; + + file + << "A =\n" << tensor_A.host_view() + << "\nB =\n" << tensor_B.host_view() + << "\nC =\n" << tensor_C.host_view() + << "\nVector =\n" << tensor_Vector.host_view() + << "\nScaleA = " << scale_A.host_view() + << "\nScaleB = " << scale_B.host_view() + << "\nScaleC = " << scale_C.host_view() + << "\nScaleD = " << scale_D.host_view() + << "\nScaleAux = " << scale_Aux.host_view() + << "\n\nReference D =\n" << reference_D.host_view() + << "\nComputed D =\n" << tensor_D.host_view(); + if (kScaleAux) { + file + << "\n\nReference Aux =\n" << reference_Aux.host_view() + << "\nComputed Aux =\n" << 
tensor_Aux.host_view() + << "\n\nReference Absmax Aux = " << reference_abs_max_Aux.host_view() + << "\nComputed Absmax Aux = " << abs_max_Aux.host_view(); + } + if (kScaleOutput) { + file + << "\n\nReference Absmax D = " << reference_abs_max_D.host_view() + << "\nComputed Absmax D = " << abs_max_D.host_view(); + } + + std::cerr << "Dumped results to " << output_file << std::endl; + + } + + return passed; + } + + /// Verifies the result is a GEMM + bool verify(const Options& options) { + + cutlass::Coord<2> origin(0); + ElementCompute scaled_alpha = options.alpha; + if (options.scale_A) { + scaled_alpha *= scale_A.host_view().at(origin); + } + if (options.scale_B) { + scaled_alpha *= scale_B.host_view().at(origin); + } + + ElementCompute scaled_beta = options.beta; + if (options.scale_C) { + scaled_beta *= scale_C.host_view().at(origin); + } + + // + // Verify + // + + cutlass::reference::host::GemmComplex< + typename Gemm::ElementA, typename Gemm::LayoutA, + typename Gemm::ElementB, typename Gemm::LayoutB, + typename Gemm::ElementC, typename Gemm::LayoutC, + ElementCompute, ElementAccumulator, ElementAccumulator + >( + options.problem_size, + scaled_alpha, + tensor_A.host_ref(), + Gemm::kTransformA, + tensor_B.host_ref(), + Gemm::kTransformB, + scaled_beta, + tensor_C.host_ref(), + tmp_D.host_ref(), + ElementAccumulator(0) + ); + + ElementCompute tmp_abs_max_Aux(0.); + ElementCompute tmp_abs_max_D(0.); + + cutlass::NumericConverter cvt_c_to_compute; + cutlass::NumericConverter cvt_accum_to_compute; + cutlass::NumericConverter cvt_compute_to_accum; + cutlass::NumericConverter cvt_compute_to_d; + cutlass::NumericConverter cvt_compute_to_aux; + + cutlass::absolute_value_op abs; + cutlass::maximum_with_nan_propogation max; + cutlass::epilogue::thread::ReLu act; + + ElementScalingFactor d_scale = kScaleOutput ? scale_D.host_view().at(origin) : ElementScalingFactor(1.); + + for (int m = 0; m < options.problem_size.m(); ++m) { + for (int n = 0; n < options.problem_size.n(); ++n) { + ElementCompute intermediate = cvt_accum_to_compute(tmp_D.host_view().at({m, n})); + ElementCompute bias = cvt_c_to_compute(tensor_Vector.host_view().at({0, n})); + ElementCompute aux = intermediate + bias; + ElementCompute d = act(aux); + tmp_abs_max_Aux = max(abs(aux), tmp_abs_max_Aux); + tmp_abs_max_D = max(abs(d), tmp_abs_max_D); + reference_D.host_view().at({m, n}) = cvt_compute_to_d(d * d_scale); + + if (kScaleAux) { + reference_Aux.host_view().at({m, n}) = cvt_compute_to_aux(aux * scale_Aux.host_view().at(origin)); + } + } + } + + if (kScaleAux) { + reference_abs_max_Aux.host_view().at(origin) = cvt_compute_to_accum(tmp_abs_max_Aux); + } + + if (kScaleOutput) { + reference_abs_max_D.host_view().at(origin) = cvt_compute_to_accum(tmp_abs_max_D); + } + + return compare_reference(options); + } + + /// Returns true if the CUDA device is sufficient to execute the kernel. + bool sufficient() const { + + if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 4)) { + std::cerr << "This example requires CUDA 12.4 or greater." 
<< std::endl; + return false; + } + + size_t smem_size = sizeof(typename Gemm::GemmKernel::SharedStorage); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + std::cerr << "cudaGetDevice() failed with error: " << cudaGetErrorString(result) << std::endl; + return false; + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() failed with error: " << cudaGetErrorString(result) << std::endl; + return false; + } + + if (properties.major < 8 || (properties.major == 8 && properties.minor < 9)) { + std::cerr << "CUTLASS's Ada FP8 GEMM example requires a device of compute capability 89 or higher.\n" << std::endl; + return false; + } + + if (properties.sharedMemPerBlockOptin < smem_size) { + std::cerr << "Insufficient shared memory. Need " << smem_size + << ", but device only has " << properties.sharedMemPerBlockOptin << std::endl; + return false; + } + + return true; + } + + /// Executes one test + bool run(Options& options) + { + + // Waive test if insufficient CUDA device + if (!sufficient()) { + std::cerr << "Insufficient resources to run the kernel." << std::endl; + return false; + } + + this->initialize(options); + + // + // Initialize the GEMM operator + // + + typename Gemm::EpilogueOutputOp::Params::ActivationParams activation_params{ + ElementCompute(options.alpha), + ElementCompute(options.beta) + }; + typename Gemm::EpilogueOutputOp::Params epilogue_params{ + activation_params, + scale_A.device_data(), + scale_B.device_data(), + scale_C.device_data(), + scale_D.device_data(), + scale_Aux.device_data(), + abs_max_Aux.device_data(), + abs_max_D.device_data() + }; + + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + options.problem_size, + /* batch_count = */ 1, + epilogue_params, + tensor_A.device_data(), + tensor_B.device_data(), + tensor_C.device_data(), + tensor_D.device_data(), + tensor_Aux.device_data(), + tensor_Vector.device_data(), + options.problem_size.m() * options.problem_size.k(), + options.problem_size.n() * options.problem_size.k(), + options.problem_size.m() * options.problem_size.n(), + options.problem_size.m() * options.problem_size.n(), + (int)options.problem_size.m(), // Batch stride vector + tensor_A.layout().stride(0), + tensor_B.layout().stride(0), + tensor_C.layout().stride(0), + tensor_D.layout().stride(0), + (int64_t)0 // Leading dimension of vector. 
This must be 0 + }; + + Gemm gemm_op; + + cutlass::Status status = gemm_op.can_implement(arguments); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Gemm::can_implement() failed" << std::endl; + return false; + } + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + status = gemm_op.initialize(arguments, workspace.get()); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Gemm::initialize() failed" << std::endl; + return false; + } + + // + // Run the GEMM + // + + status = gemm_op(); + + if (status != cutlass::Status::kSuccess) { + std::cerr << "Gemm::run() failed" << std::endl; + return false; + } + + cudaError_t cuda_error = cudaDeviceSynchronize(); + if (cuda_error != cudaSuccess) { + std::cerr << "CUDA error: " << cudaGetErrorString(cuda_error) << std::endl; + return false; + } + + // + // Verify + // + + bool passed = true; + if (options.reference_check) { + passed &= this->verify(options); + } else { + std::cout << "Skipped reference check" << std::endl; + } + + // + // Warm up + // + + for (int i = 0; i < options.warmup_iterations; ++i) { + gemm_op(); + } + + // + // Profile + // + + cudaEvent_t events[2]; + cudaError_t error; + for (auto & event : events) { + error = cudaEventCreate(&event); + if (error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(error) << std::endl; + return false; + } + } + + // Record an event at the start of a series of GEMM operations + error = cudaEventRecord(events[0]); + if (error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(error) << std::endl; + return false; + } + + // Run profiling loop + for (int iter = 0; iter < options.iterations; ++iter) { + gemm_op(); + } + + // Record an event when the GEMM operations have been launched. + error = cudaEventRecord(events[1]); + if (error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(error) << std::endl; + return false; + } + + // Wait for work on the device to complete. + error = cudaEventSynchronize(events[1]); + if (error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(error) << std::endl; + return false; + } + + // Measure elapsed runtime + float runtime_ms = 0; + error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(error) << std::endl; + return false; + } + + // Compute average runtime and GFLOPs. 
+ runtime_ms = runtime_ms / float(options.iterations); + float gflops = options.gflops(runtime_ms / 1000.0f); + + std::cout << "Problem size: " << options.problem_size.m() << 'x' << options.problem_size.n() << 'x' << options.problem_size.k() << std::endl; + std::cout << "Runtime (ms): " << runtime_ms << std::endl; + std::cout << "GFLOPs/sec: " << gflops << std::endl; + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + return passed; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const** argv) { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 4) || + (props.major != 8 && props.minor != 9)) { + + // + // This example requires an NVIDIA Ada-architecture GPU. + // + + std::cout + << "CUTLASS's FP8 SM89 example requires a GPU of NVIDIA's Ada architecture " + << "and CUDA toolkit version 12.4 or later.\n"; + + return 0; + } + + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + std::cout << "Running GEMM with staged accumulation (OpMultiplyAdd)" << std::endl; + std::cout << "=====================================================" << std::endl; + TestbedRunner> testbed_staged_accum; + bool passed = testbed_staged_accum.run(options); + + if (passed) { + std::cout << "Passed" << std::endl; + } else { + std::cout << "Failed" << std::endl; + } + + std::cout << "\nRunning GEMM with fast accumulation (OpMultiplyAddFastAccum)" << std::endl; + std::cout << "============================================================" << std::endl; + TestbedRunner> testbed_fast_accum; + passed = testbed_fast_accum.run(options); + + if (passed) { + std::cout << "Passed" << std::endl; + } else { + std::cout << "Failed" << std::endl; + } + + return 0; +} diff --git a/examples/59_ampere_gather_scatter_conv/CMakeLists.txt b/examples/59_ampere_gather_scatter_conv/CMakeLists.txt new file mode 100644 index 0000000000..ce22cd1f37 --- /dev/null +++ b/examples/59_ampere_gather_scatter_conv/CMakeLists.txt @@ -0,0 +1,40 @@ +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +if (NOT MSVC) + +cutlass_example_add_executable( + 59_ampere_gather_scatter_conv + ampere_gather_scatter_conv.cu +) + +if (CUTLASS_ENABLE_OPENMP_TESTS AND OpenMP_CXX_FOUND) + target_link_libraries(59_ampere_gather_scatter_conv PRIVATE OpenMP::OpenMP_CXX) +endif() + +endif() diff --git a/examples/59_ampere_gather_scatter_conv/README.md b/examples/59_ampere_gather_scatter_conv/README.md new file mode 100644 index 0000000000..4aac053639 --- /dev/null +++ b/examples/59_ampere_gather_scatter_conv/README.md @@ -0,0 +1,209 @@ +# Example 59: Ampere gather/scatter convolution + +CuTe and CUTLASS 3.x based Ampere convolution forward propagation kernel capable of operating on both affine and gather/scatter tensors. + +Example executions: +```sh +./59_ampere_gather_scatter_conv +./59_ampere_gather_scatter_conv --n=108 +./59_ampere_gather_scatter_conv --n=4096 --i=1 +./59_ampere_gather_scatter_conv --n=1080 --i=1000 +./59_ampere_gather_scatter_conv --n=131072 --i=1000 --no-check +``` + +This example demonstrates a few super cool features of CUTLASS and CuTe. It shows off +1. A dense conv 3D fprop kernel written as a single file ... +2. ... that leverages off-the-shelf CUTLASS collectives to show how custom kernels can use collectives ... +3. ... and uses the exact same templated kernel to also stamp out a gather/scatter 3D fprop conv ... +4. ... while getting near peak performance of the Ampere class tensor core on Ampere and Ada GPUs ... +5. ... by using static cute shapes and strides in case problem shapes are known at compile time. + +## A dense conv 3D fprop kernel written in CUTLASS 3.x and CuTe + +The most common strategy for implementing high performance convolution kernels on the GPU is to transform +the activation tensor in such a way that we can perform the computation as a GEMM. This is called the +image to column (im2col) transformation. [CUTLASS 2.x implementation of im2col based convolutions is +documented separately](../../media/docs/implicit_gemm_convolution.md), and here we consider a fresh approach for CuTe. + +A 3D convolution has the following input tensors: +- Activation tensor (Act): `((N,(D,H,W)), (C,(1,1,1)))` +- Filter tensor (Flt): `( K, (C,(T,R,S)))` +- Output tensor (Out): `((N,(Z,P,Q)), K )` + +Where +- N := number of images +- DHW := spatial dimensions of the activation tensor +- C := channel dimension of the activation tensor +- K := channel dimension of the filter and output tensor +- TRS := spoke dimensions of the filter tensor +- ZPQ := spatial dimensions of the output tensor + +As is evident in the tensor shapes, these cannot be issued to a GEMM just yet, since there is no +logical M, N, and K modes we can group the tensor modes into. 
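+
+For concreteness, these nested shapes map directly onto CuTe layouts. The sketch below mirrors the activation-tensor construction used later in `ampere_gather_scatter_conv.cu`; the extents `N, D, H, W, C` are plain integers here purely for illustration (the example itself fixes every extent except `N` at compile time):
+
+```cpp
+// Activation tensor ((N,(D,H,W)), (C,(1,1,1))) as a CuTe layout:
+// NDHWC in memory, with C as the contiguous (stride-1) mode.
+auto act_layout = make_layout(
+  make_shape (make_shape (      N,     D,   H, W), make_shape (   C, _1{}, _1{}, _1{})),
+  make_stride(make_stride(D*H*W*C, H*W*C, W*C, C), make_stride(_1{}, _0{}, _0{}, _0{})));
+```
+
+The filter and output tensors are set up analogously in the example source.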
+ +Notice that every spoke of the filter tensor (TRS) will be applied to some (offset) view of the +activation tensor, thus expanding the logical size of the activation tensor. +Additionally, a similar logical transform of the spatial dimensions can be encoded as a function of the +padding, dilations, traversal strides, and filter spokes. This gets us to our im2col transform: + +im2col transform affects the component shapes/strides of the activation tensor in the following way: +- ZPQ Shape : changes DHW domain with formula `(1 + (DHW + pad - (((TRS-1) * dilation) + 1)) / traversal_stride)` +- TRS Shape : TRS domain instead of `(1,1,1)` +- ZPQ Strides : Original DHW strides get `elem_scale()`-ed by traversal strides DHW +- TRS Strides : Original DHW strides get `elem_scale()`-ed by dilation DHW + +With this transform applied, we end up with a set of input and output tensors that +are logically consistent in their MNK dimensions, thus allowing us to dispatch to a GEMM. +im2col activation layout: ((N,(Z,P,Q)), (C,(T,R,S))) // logical (M,K) +filter layout : ( K, (C,(T,R,S))) // logical (N,K) +output layout : ((N,(Z,P,Q)), K ) // logical (M,N) + +CuTe's layout representation and algebra make these folded tensors easy to represent and manipulate. +This is most evident in the reference check code used in this example: + +```cpp +for (size_t logical_m = 0; logical_m < size<0>(mOutputRef); ++logical_m) { + for (size_t logical_n = 0; logical_n < size<1>(mOutputRef); ++logical_n) { + auto accumulator = float(0); + for (size_t logical_k = 0; logical_k < size<1>(mStencil); ++logical_k) { + accumulator += mStencil(logical_m, logical_k) * mActivation(logical_n, logical_k); + } + mOutputRef(logical_m, logical_n) = accumulator; + } +} +``` + +Which succinctly demonstrates how im2col transform allows us to implement convolutions +as GEMMs with special layout transformations on the input tensor. + +Note: in the example kernel's implementation we treat activations as the B tensor +and filter as the A tensor, thus making their logical dimensions NK and MK respectively. + +## Leveraging CUTLASS collectives off the shelf in a custom kernel + +Now that we have transformed our problem in such a way that allows us to dispatch to a GEMM, +we can reuse much of the machinery CUTLASS offers to implement this forward pass convolution +operator. CUTLASS decomposes these "moving parts" of GPU linear algebra into reusable, +modular software components abstracted by C++ template classes. This example +demonstrates how some of the lower layers of the hierarchy can be re-used for custom kernels +by writing a custom kernel for convolution that re-uses the Ampere/Ada GEMM collectives +from CUTLASS 3. + +A kernel author is free to compose their custom components with any of the existing templates +in the CUTLASS hierarchy to leverage existing high performance implementations from the CUTLASS +team. In this example, we write a custom kernel layer and compose with an existing collective. +However, any of the CUTLASS kernels can be composed with bespoke collectives if the desired +customization is a mainloop or epilogue fusion without changes to the grid planning, +tile scheduling, load balancing, or thread marshalling. + +## Implementing gather/scatter and dense convolution with the same kernel + +Functionality and correctness of the implemented kernel, as a virtue of using +CuTe and off the shelf CUTLASS collectives, only relies on the logical consistency of +the layouts of input and output tensors. 
This means that we can freely change how +the logical coordinates of the tensors map into the index space, and even how these dereferences +happen. [CUTLASS example 52](../52_hopper_gather_scatter_fusion/) demonstrates this by implementing a custom stride that +supports indexed indirection for tensor data accesses. This allows for example 52 +to implement a GEMM where inputs are gathered and output is scattered based on an index buffer. + +We re-use the same custom stride utilities in this example to implement a convolution kernel +that gathers along the NDHW dimensions of the activation tensor and scatters the output along the +NZPQ dimensions of the output tensor, treating the channel dimensions as the dense vectors. + +Our dense affine im2col transformed activation tensor: + +```cpp +// im2col transformed activation layout: ((nzpq), (ctrs)) => idx +auto xformed_act_layout = make_layout( + make_shape (make_shape ( N, Z, P, Q), make_shape ( C, T, R, S)), + make_stride(make_stride(D*H*W*C, H*W*C, W*C, C), make_stride(_1{}, H*W*C, W*C, C))); +``` + +now becomes a composed layout that uses `IndexedGather`: + +```cpp +// Inner layout of the composition: +// ((nzpq), (csrt)) => (idx_buffer_idx, dense_offset) +auto EG = E<0>{}; // Gather basis (1,0) (idx_buffer_idx) +auto EC = E<1>{}; // Contiguous basis (0,1) (dense_offset) +auto xformed_act_logical_inner = make_layout( + make_shape (make_shape ( N, Z, P, Q), make_shape ( C, T, R, S)), + make_stride(make_stride(D*H*W*EG, H*W*EG, W*EG, EG), make_stride(EC, H*W*EG, W*EG, EG))); + +// Outer layout of the composition: +// (idx_buffer_idx, dense_offset) => idx +// IndexedGather obtains idx by applying (gmem_base_ptr + gather_idx_buf[idx_buffer_idx] + dense_offset) +auto xformed_act_gather_outer = make_layout( + make_shape(_1{},_1{}), + make_stride(CustomStride{IndexedGather{gather_idx_buf}, C}, _1{})); + +// Compose the inner and outer layouts +// ((nzpq), (ctrs)) => idx +auto xformed_act_composed_layout = composition( + xformed_act_gather_outer, + make_arithmetic_tuple(_0{}, _0{}), + xformed_act_logical_inner); +``` + +Here, we create a composed layout whose inner layout has the same logical MK shape as earlier, +but with an outer layout that uses the custom strides with an index buffer to access memory with +indirections. A custom stride requires two inputs to compute the index that a certain coordinate maps to: +the index buffer offset and the dense offset into the vector. This entails that our inner layout +(the one with the logical MK shape) has a rank-2 codomain `(idx_buffer_idx, dense_offset)`. +We can set up such a layout with scaled basis strides, which allow us to map a domain onto a +codomain with multiple orthogonal bases. The two codomain basis are the +index buffer offsets (rank 0 basis), and the dense vector offsets (rank 1 basis). +A similar composed layout is set up for the output scatter tensor. + +This tensor still has a logical MK shape and is backed by a CuTe layout, which means we can still +tile, partition, and otherwise manipulate it with CuTe's layout algebra in the same way we would any +other tensor. Substituting the activation tensor's affine layout for this gather layout requires +no changes to the implementation of the kernel whatsoever. Everything composes. This example +stamps out a dense 3D convolution as well as gather/scatter 3D convolution using the same kernel template, +with the only difference between them being the layouts of the input and output tensors. 
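+
+At the call site, this amounts to constructing the tensors with a different layout and then launching the very same kernel. The sketch below is simplified from the launch code in `ampere_gather_scatter_conv.cu`; the tensor, layout, and launch-configuration names (`lauch_grid`, `smem_size`, and so on) are the ones defined in that file:
+
+```cpp
+// Dense fprop: activations viewed through the affine im2col layout.
+Tensor mXformedAct = make_tensor(make_gmem_ptr(activations), xformed_act_layout);
+kernel_entrypoint<AmpereUnpredicatedFprop, decltype(mFilter), decltype(mXformedAct), decltype(mOutput)>
+  <<<lauch_grid, AmpereUnpredicatedFprop::MaxThreadsPerBlock, smem_size>>>(mFilter, mXformedAct, mOutput);
+
+// Gather/scatter fprop: identical kernel template, only the layouts change.
+Tensor mXformedActGather = make_tensor(make_gmem_ptr(activations), xformed_act_composed_layout);
+kernel_entrypoint<AmpereUnpredicatedFprop, decltype(mFilter), decltype(mXformedActGather), decltype(mOutputScatter)>
+  <<<lauch_grid, AmpereUnpredicatedFprop::MaxThreadsPerBlock, smem_size>>>(mFilter, mXformedActGather, mOutputScatter);
+```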
+ +Convolutions are just a special case of tensor contractions, and as [example 51](../51_hopper_gett) +demonstrates, the exact same collective used in this example can also be used to implement arbitrary GETTs. +Of course, this also means that the same kernel can implement gather/scatter GETTs as well! + +This demonstrates the composition power of not just CuTe, but also CUTLASS 3's two level +micro kernel abstraction. A single highly tuned temporal micro-kernel (collective) can be implemented once +and applied to compute dense GETTs, gather/scatter GETTs, dense convolutions, and gather/scatter convolutions. + +## Peak performance on Ampere and Ada GPUs by leveraging domain specific knowledge + +Often, when implementing custom kernels, a user has more knowledge of the problem domain that can be +exploited to deliver higher performance than otherwise could be through general kernels. In this example +we presume that the shape of each of the images (DHWC dimensions) as well as the filter (TRS) are available +a-priori and that the tile shape evenly divides the problem. Number of images (N) is still left as a runtime +parameter. + +Knowing the extents of our tensors at compile time allows us to encode them as static cute shapes rather than +a dynamic problem shape, resulting in the elimination of most of the index computation instructions such as +expensive div/mods. Knowing that the problem shape is divisible by the tile shape allows us to use the +Ampere collective that does not perform predication on global memory loads, further reducing overheads +and allowing us to achieve near peak performance on RTX Ampere and Ada GPUs. + +Running this example on an RTX 3080Ti prints the following performance numbers (some output culled for brevity): + +``` +$> ./examples/59_ampere_gather_scatter_conv/59_ampere_gather_scatter_conv --n=131072 --i=128 --no-check +Ampere convolution forward propogation kernel supporting both affine and gather/scatter tensors. + +Allocating tensors ... done. +Initializing data ... done. +Initializing gather/scatter index buffers ... done. + +Running dense fprop kernel +Conv TFLOP count = 0.927713 +Conv dense perf: 31.027376ms | TFLOP/s = 29.899819 + +Running gather/scatter fprop kernel +Conv TFLOP count = 0.927713 +Conv gather/scatter perf: 28.973721ms | TFLOP/s = 32.019117 +``` + +With this in mind, this example kernel has the following limitations: +- This example kernel only supports dynamic image count, all other conv problem shape must be defined as `cute::Constant<>`s +- Problem shapes (including dynamic image count `N`) must be evenly divisible by the tile shape +- It does not perform fp32->tf32 numeric conversion, gmem inputs must be rounded to tf32 already diff --git a/examples/59_ampere_gather_scatter_conv/ampere_conv_kernel.h b/examples/59_ampere_gather_scatter_conv/ampere_conv_kernel.h new file mode 100644 index 0000000000..cc00cced96 --- /dev/null +++ b/examples/59_ampere_gather_scatter_conv/ampere_conv_kernel.h @@ -0,0 +1,320 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/atom/copy_atom.hpp" +#include + +#include "cutlass/util/print_error.hpp" + +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_mma.hpp" + +using namespace cute; + +struct AmpereUnpredicatedFprop { + // + // Static config for conv problem shape + // + using D = _6; + using H = _4; + using W = _4; + + using T = _3; + using R = _3; + using S = _3; + + using Z = _4; + using P = _2; + using Q = _2; + + using C = _64; + using K = _128; + + // Tiler config + using Tiler_K = decltype(cute::min(K{}, _128{})); + using Tiler_C = decltype(cute::min(C{}, _32{})); + using Tiler_N = _4; + using TileM = Tiler_K; + using TileN = Shape; + using TileK = Shape; + using PIPE = _3; + using TilerFlt = Shape; + using TilerAct = Shape; + using TilerOut = Shape; + + using TileSizeM = Int; + using TileSizeN = Int; + using TileSizeK = Int; + static constexpr int Stages = PIPE::value; + + using ElementFlt = tfloat32_t; + using ElementAct = tfloat32_t; + using ElementOut = float; + + using TiledMma = TiledMMA< + MMA_Atom, + Layout>, + Tile<_32,_32,Underscore>>; + + static constexpr int MaxThreadsPerBlock = size(TiledMma{}); + static constexpr int MinBlocksPerMultiprocessor = 1; + + union SharedStorage { + struct { + ElementFlt sAMatrix[size(TileM{}) * size(TileK{}) * size(PIPE{})]; + ElementAct sBMatrix[size(TileN{}) * size(TileK{}) * size(PIPE{})]; + } mainloop; + + struct { + ElementOut sCMatrix[size(TileM{}) * size(TileN{})]; + } epilogue; + }; + + // + // Stencil tensor + // + + using GmemLayoutFlt = decltype(make_ordered_layout( + Shape< K, Shape< C, T, R, S>>{}, + tuple<_4, tuple<_0,_3,_2,_1>>{})); + + // We have 64 elements * 32b each in the major mode that we can vectorize + // Max vector size is 128b, so lay 16 threads along the major mode with a vector size of 4 + // Rest along the minor mode + using GmemTiledCopyFlt = decltype(make_tiled_copy( + Copy_Atom, ElementFlt>{}, + Layout, + Stride< _8, _1>>{}, + Layout>{})); + + // Following layout is also correct, but trades off dynamic strides in the slice for bank conflict free accesses + // using SmemLayoutFlt = decltype( + // 
composition(Swizzle<3,2,3>{}, + // make_ordered_layout( + // Shape{}, + // tuple< _1, _0, _2>{}))); + + using SmemLayoutAtomFlt = decltype( + composition(Swizzle<1,2,3>{}, + Layout>, + Stride<_4,Stride<_1,_32>>>{})); + + using SmemCopyAtomFlt = Copy_Atom; + + // + // Activation tensor + // + + // Activation tensor is major in the contraction mode, so vectorize that mode first + // Then lay out the rest of the threads along the other mode + using GmemTiledCopyAct = decltype(make_tiled_copy( + Copy_Atom, ElementAct>{}, + Layout, + Stride< _8, _1>>{}, + Layout>{})); + + // Following layout is also correct, but trades off dynamic strides in the slice for bank conflict free accesses + // using SmemLayoutAct = decltype( + // composition(Swizzle<3,2,3>{}, + // make_ordered_layout( + // Shape{}, + // tuple< _1, _0, _2>{}))); + + using SmemLayoutAtomAct = decltype( + composition(Swizzle<1,2,3>{}, + Layout>, + Stride<_4,Stride<_1,_32>>>{})); + + using SmemCopyAtomAct = Copy_Atom; + + // + // Output tensor + // + + using GmemTiledCopyOut = decltype(make_tiled_copy( + Copy_Atom, ElementAct>{}, + Layout, + Stride<_1, _8>>{}, + Layout>{})); + + using SmemCopyAtomOut = Copy_Atom, ElementOut>; + + // This can be optimized to make accesses BCF, but we use a col-major layout here to show off composability + using SmemLayoutOut = Layout>; + + // + // Conv functor + // + template + void __device__ + operator()(cute::Tensor mFlt, // ( K, (C,T,R,S)) + TensorActivation mAct, // ((N,Z,P,Q), (C,T,R,S)) + TensorOutput mOut, // ( K, (N,Z,P,Q)) + char* smem_buf) const { + using namespace cute; + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveMma< + cutlass::gemm::MainloopSm80CpAsyncUnpredicated, + Shape, + ElementFlt, + Underscore, // Ignore the stride, we are passing full cute::Tensor to operator() + ElementAct, + Underscore, // Ignore the stride, we are passing full cute::Tensor to operator() + TiledMma, + GmemTiledCopyFlt, + SmemLayoutAtomFlt, + SmemCopyAtomFlt, + cute::identity, + GmemTiledCopyAct, + SmemLayoutAtomAct, + SmemCopyAtomAct, + cute::identity>; + + TiledMma tiled_mma; + Tensor accum = partition_fragment_C(tiled_mma, TilerOut{}); + clear(accum); + + // Set up tensors + // NOTE: blockIdx.x projects onto act-NDHW mode, y along the flt-K mode for the sake of higher dynamic range in NDHW + Tensor gA_mk = local_tile(mFlt, TilerFlt{}, make_coord(_,_)); // (BLK_M,BLK_K,m',k') + Tensor gB_nk = local_tile(mAct, TilerAct{}, make_coord(_,_)); // (BLK_N,BLK_K,n',_1) + Tensor gC_mn = local_tile(mOut, TilerOut{}, make_coord(_,_)); // (BLK_M,BLK_N,m',n') + + // Compute m_coord and n_coord with their post-tiled shapes + auto m_coord = idx2crd(int(blockIdx.y), shape<2>(gA_mk)); + auto n_coord = idx2crd(int(blockIdx.x), shape<2>(gB_nk)); + Tensor gA = gA_mk(_,_,m_coord,_); // (BLK_M,BLK_K,k') + Tensor gB = gB_nk(_,_,n_coord,_); // (BLK_N,BLK_K,_1) + Tensor gC = gC_mn(_,_,m_coord,n_coord); // (BLK_M,BLK_N) + + auto k_tile_iter = cute::make_coord_iterator(size<2>(gA)); + int k_tile_count = size<2>(gA); + + CollectiveMainloop collective_mma; + collective_mma( + accum, + gA, + gB, + accum, + k_tile_iter, k_tile_count, + Underscore{}, // no residue since we do not support predication + threadIdx.x, + smem_buf); + + // + // Epilogue + // + SharedStorage& storage = *reinterpret_cast(smem_buf); + Tensor sC = make_tensor(make_smem_ptr(&storage.epilogue.sCMatrix[0]), SmemLayoutOut{}); + + auto smem_tiled_copy_C = make_tiled_copy_C(SmemCopyAtomOut{}, tiled_mma); + auto smem_thr_copy_C = 
smem_tiled_copy_C.get_slice(threadIdx.x); + auto tCrC = smem_thr_copy_C.retile_S(accum); + auto tCsC = smem_thr_copy_C.partition_D(sC); + copy(smem_tiled_copy_C, tCrC, tCsC); + + __syncthreads(); + + GmemTiledCopyOut gmem_tiled_copy_C; + auto gmem_thr_copy_C = gmem_tiled_copy_C.get_slice(threadIdx.x); + auto tDsC = gmem_thr_copy_C.partition_S(sC); + auto tDgC = gmem_thr_copy_C.partition_D(gC); + copy(gmem_tiled_copy_C, tDsC, tDgC); + + #if 0 + if (thread0()) { + print("mAct = "); print(mAct); print('\n'); + print("mFlt = "); print(mFlt); print('\n'); + print("mOut = "); print(mOut); print('\n'); + print("gA = "); print(gA); print('\n'); + print("gB = "); print(gB); print('\n'); + print("gC = "); print(gC); print('\n'); + print("sA = "); print(sA.layout()); print('\n'); + print("sB = "); print(sB.layout()); print('\n'); + print("sC = "); print(sC.layout()); print('\n'); + print("tAgA = "); print(tAgA.layout()); print('\n'); + print("tBgB = "); print(tBgB.layout()); print('\n'); + print("tAsA = "); print(tAsA.layout()); print('\n'); + print("tBsB = "); print(tBsB.layout()); print('\n'); + print("tCsA = "); print(tCsA.layout()); print('\n'); + print("tCsB = "); print(tCsB.layout()); print('\n'); + print("tCrC = "); print(tCrC.layout()); print('\n'); + print("tCsC = "); print(tCsC.layout()); print('\n'); + print("tDsC = "); print(tDsC.layout()); print('\n'); + print("tDgC = "); print(tDgC.layout()); print('\n'); + print("gmem tiled copy A = "); print(gmem_tiled_copy_A); print('\n'); + print("gmem tiled copy B = "); print(gmem_tiled_copy_B); print('\n'); + print("gmem tiled copy C = "); print(gmem_tiled_copy_C); print('\n'); + print("k_tile_count = "); print(size<2>(gA)); print('\n'); + print("k_tile_iter = "); print(*k_tile_iter); print('\n'); + print("K_BLOCK_MAX = "); print(K_BLOCK_MAX); print('\n'); + } + #endif + } +}; + +template +inline int +fprop_reference( + TensorFlt mStencil, // Logical MK: ( K, (C,T,R,S)) + TensorAct mActivation, // Logical NK: ((N,Z,P,Q), (C,T,R,S)) + TensorOut mOutput, // Logical MN: ( K, (N,Z,P,Q)) + TensorOut mOutputRef) { + int32_t N = size<1,0>(mOutputRef); + int32_t Z = size<1,1>(mOutputRef); + int32_t P = size<1,2>(mOutputRef); + int32_t Q = size<1,3>(mOutputRef); + int32_t T = size<1,3>(mStencil); + int32_t R = size<1,2>(mStencil); + int32_t S = size<1,1>(mStencil); + int32_t C = size<1,0>(mStencil); + + size_t K = static_cast(size<0>(mOutputRef)); + size_t NZPQ = static_cast(size<1>(mOutputRef)); + size_t CTRS = static_cast(size<1>(mStencil)); + +#if defined(_OPENMP) + #pragma omp parallel for +#endif + for (size_t logical_m = 0; logical_m < K; ++logical_m) { + for (size_t logical_n = 0; logical_n < NZPQ; ++logical_n) { + auto accumulator = float(0); + for (size_t logical_k = 0; logical_k < CTRS; ++logical_k) { + accumulator += mStencil(logical_m, logical_k) * mActivation(logical_n, logical_k); + } + mOutputRef(logical_m, logical_n) = accumulator; + } + } + + return print_relative_error(mOutput, mOutputRef, /*print_verbose*/ false, /*print_error*/ true, /*error_margin*/ 0.01); +} diff --git a/examples/59_ampere_gather_scatter_conv/ampere_gather_scatter_conv.cu b/examples/59_ampere_gather_scatter_conv/ampere_gather_scatter_conv.cu new file mode 100644 index 0000000000..341d1e9fd1 --- /dev/null +++ b/examples/59_ampere_gather_scatter_conv/ampere_gather_scatter_conv.cu @@ -0,0 +1,392 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief Example demonstrating a CuTe and CUTLASS 3.x based Ampere convolution forward propagation kernel
+ capable of operating on both affine and gather/scatter tensors.
+
+ This example demonstrates a few super cool features of CUTLASS and CuTe. It shows off
+ 1. A dense conv 3D fprop kernel written as a single file ...
+ 2. ... that leverages off-the-shelf CUTLASS collectives to show how custom kernels can use collectives ...
+ 3. ... and uses the exact same templated kernel to also stamp out a gather/scatter 3D fprop conv ...
+ 4. ... while getting near peak performance of the Ampere class tensor core on Ampere and Ada GPUs ...
+ 5. ... by using static cute shapes and strides in case problem shapes are known at compile time.
+
+ Full documentation for this example can be found within the README.md file in this directory.
+ + Example executions: + ./59_ampere_gather_scatter_conv + ./59_ampere_gather_scatter_conv --n=108 + ./59_ampere_gather_scatter_conv --n=4096 --i=1 + ./59_ampere_gather_scatter_conv --n=1080 --i=1000 + ./59_ampere_gather_scatter_conv --n=131072 --i=1000 --no-check +*/ + +#include +#include + +#include "ampere_conv_kernel.h" +#include "gather_tensor.hpp" + +#include "cutlass/util/command_line.h" + +bool check_cuda_result(cudaError_t code, const char* file, int line) { + if (code == cudaSuccess) { + return true; + } + + std::cerr << "CUDA error at (" << file << "," << line << ")\n\t" << unsigned(code) << " -- " << cudaGetErrorString(code) << "\n"; + return false; +} + +#define CHECK_CUDA(code) (check_cuda_result(code, __FILE__, __LINE__)) + +using namespace cute; +using example::IndexedGather; +using example::CustomStride; + +template +__global__ +__launch_bounds__(Operator::MaxThreadsPerBlock, Operator::MinBlocksPerMultiprocessor) +void kernel_entrypoint(FilterTensor mFlt, ActivationTensor mAct, OutputTensor mOut) { + extern __shared__ char smem_buf[]; + Operator op; + op(mFlt, mAct, mOut, smem_buf); +} + +int ampere_dense_conv_fprop( + int num_images, + float* activations, + float* filter, + float* output, + float* output_ref, + int num_iterations = 1, + bool do_ref_check = true) { + auto D = typename AmpereUnpredicatedFprop::D{}; + auto H = typename AmpereUnpredicatedFprop::H{}; + auto W = typename AmpereUnpredicatedFprop::W{}; + auto Z = typename AmpereUnpredicatedFprop::Z{}; + auto P = typename AmpereUnpredicatedFprop::P{}; + auto Q = typename AmpereUnpredicatedFprop::Q{}; + auto C = typename AmpereUnpredicatedFprop::C{}; + auto K = typename AmpereUnpredicatedFprop::K{}; + auto S = typename AmpereUnpredicatedFprop::S{}; + auto R = typename AmpereUnpredicatedFprop::R{}; + auto T = typename AmpereUnpredicatedFprop::T{}; + + int N = num_images; // dynamic + if (num_images % int(typename AmpereUnpredicatedFprop::Tiler_N{}) != 0) { + printf("ERROR: Input image count must be evenly divisible by CTA tiler N.\n"); + return 1; + } + + // Tensor Activation: (n,d,h,w,c)::(?,6,4,4,64):(6144,1536,384,64,1) + auto activation_layout = make_layout( + make_shape (make_shape ( N, D, H, W), make_shape ( C, _1{},_1{},_1{})), + make_stride(make_stride(D*H*W*C, H*W*C, W*C, C), make_stride(_1{}, _0{},_0{},_0{}))); + + auto xformed_act_layout = make_layout( + make_shape (make_shape(N, Z, P, Q), make_shape ( C, T, R, S)), + make_stride(stride<0>(activation_layout), make_stride(_1{}, H*W*C, W*C, C))); + + // Tensor Filter : (k,c,s,r,t)::(128,3,3,3,64):(1728,576,192,64,1) + auto filter_layout = AmpereUnpredicatedFprop::GmemLayoutFlt{}; + + // Tensor Output : (n,z,p,q,k)::(?,4,2,2,128):(2048,1024,512,128,1) + auto output_layout = make_ordered_layout( + make_shape( K, make_shape( N, Z, P, Q)), + make_tuple(_0{}, make_tuple(_4{},_3{},_2{},_1{}))); + + Tensor mActivation = make_tensor(make_gmem_ptr(activations), activation_layout); + Tensor mXformedAct = make_tensor(make_gmem_ptr(activations), xformed_act_layout); + Tensor mFilter = make_tensor(make_gmem_ptr(filter), filter_layout); + Tensor mOutput = make_tensor(make_gmem_ptr(output), output_layout); // (K, (N,Z,P,Q)) + Tensor mOutputRef = make_tensor(make_gmem_ptr(output_ref), output_layout); + + print("xformed act layout ((N,Z,P,Q), (C,T,R,S)) = "); print(xformed_act_layout); print("\n"); + + cudaEvent_t start, stop; + CHECK_CUDA(cudaEventCreate(&start)); + CHECK_CUDA(cudaEventCreate(&stop)); + + constexpr size_t smem_size = sizeof(typename 
AmpereUnpredicatedFprop::SharedStorage); + Tensor gOutput_mn = zipped_divide(mOutput, typename AmpereUnpredicatedFprop::TilerOut{}); // ((BLK_M, BLK_N), (m', n')) + dim3 lauch_grid {static_cast(size<1,1>(gOutput_mn)), static_cast(size<1,0>(gOutput_mn)), 1}; + + CHECK_CUDA(cudaFuncSetAttribute( + kernel_entrypoint, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + + CHECK_CUDA(cudaEventRecord(start)); + for (int i = 0; i < num_iterations; ++i) { + kernel_entrypoint + <<>>( + mFilter, mXformedAct, mOutput); + } + CHECK_CUDA(cudaEventRecord(stop)); + CHECK_CUDA(cudaEventSynchronize(stop)); + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + milliseconds /= float(num_iterations); + + double tflop_count = (2 * double(size<0>(xformed_act_layout)) * double(size(filter_layout))) / double(1e12); + double tflops = tflop_count / (double(milliseconds) / double(1e3)); + + printf("Conv TFLOP count = %f\n", tflop_count); + printf("Conv dense perf: %fms | TFLOP/s = %f\n", milliseconds, tflops); + + if (do_ref_check) { + printf("Running host reference check ...\n"); + return fprop_reference(mFilter, mXformedAct, mOutput, mOutputRef); + } + else { + return 0; + } +} + +int ampere_gather_scatter_conv_fprop( + int num_images, + float* activations, + uint32_t *gather_idx_buf, + float* filter, + float* output, + uint32_t *scatter_idx_buf, + int num_iterations = 1) { + auto D = typename AmpereUnpredicatedFprop::D{}; + auto H = typename AmpereUnpredicatedFprop::H{}; + auto W = typename AmpereUnpredicatedFprop::W{}; + auto Z = typename AmpereUnpredicatedFprop::Z{}; + auto P = typename AmpereUnpredicatedFprop::P{}; + auto Q = typename AmpereUnpredicatedFprop::Q{}; + auto C = typename AmpereUnpredicatedFprop::C{}; + auto K = typename AmpereUnpredicatedFprop::K{}; + auto S = typename AmpereUnpredicatedFprop::S{}; + auto R = typename AmpereUnpredicatedFprop::R{}; + auto T = typename AmpereUnpredicatedFprop::T{}; + + int N = num_images; // dynamic + if (N % int(typename AmpereUnpredicatedFprop::Tiler_N{}) != 0) { + printf("ERROR: Input image count must be evenly divisible by CTA tiler N. 
Got num_images = %d\n", N); + return 1; + } + + // Tensor Filter : (k,c,s,r,t)::(128,3,3,3,64):(1728,576,192,64,1) + auto filter_layout = AmpereUnpredicatedFprop::GmemLayoutFlt{}; + + // Tensor Output : (n,z,p,q,k)::(?,4,2,2,128):(2048,1024,512,128,1) + auto output_layout = make_ordered_layout( + make_shape( K, make_shape( N, Z, P, Q)), + make_tuple(_0{}, make_tuple(_4{},_3{},_2{},_1{}))); + + // Input gather layout + // inner_layout(make_coord((nzpq), (csrt))) => (idx_buffer_idx, dense_c_idx) + auto EG = E<0>{}; // Gather basis (1,0) (idx_buffer_idx) + auto EC = E<1>{}; // Contiguous basis (0,1) (dense_offset) + auto xformed_act_logical_inner = make_layout( + make_shape (make_shape ( N, Z, P, Q), make_shape ( C, T, R, S)), + make_stride(make_stride(D*H*W*EG, H*W*EG, W*EG, EG), make_stride(EC, H*W*EG, W*EG, EG))); + + // outer_layout(make_coord(idx_buffer_idx, dense_c_idx)) => idx + // IndexedGather obtains idx by applying (gmem_base_ptr + gather_idx_buf[idx_buffer_idx] + dense_offset) + auto xformed_act_gather_outer = make_layout( + make_shape(_1{},_1{}), + make_stride(CustomStride{IndexedGather{gather_idx_buf}, C}, _1{})); + + // Compose the inner and outer layouts + // gather_composed(make_coord((nzpq), (csrt))) => idx + auto xformed_act_composed_layout = composition( + xformed_act_gather_outer, + make_arithmetic_tuple(_0{}, _0{}), + xformed_act_logical_inner); + + // Output scatter layout + auto out_basis_stride = make_stride( + E<1>{}, + make_stride(Z*P*Q*E<0>{}, P*Q*E<0>{}, Q*E<0>{}, _1{}*E<0>{})); // -> (crd0, crd1) + auto out_basis_layout = make_layout(shape(output_layout), out_basis_stride); + auto out_scatter_layout = make_layout( + make_shape(_1{},_1{}), + make_stride(CustomStride{IndexedGather{scatter_idx_buf}, K}, _1{})); + auto out_composed_layout = composition( + out_scatter_layout, + make_arithmetic_tuple(_0{},_0{}), + out_basis_layout); + + Tensor mXformedActGather = make_tensor(make_gmem_ptr(activations), xformed_act_composed_layout); + Tensor mFilter = make_tensor(make_gmem_ptr(filter), filter_layout); + Tensor mOutputScatter = make_tensor(make_gmem_ptr(output), out_composed_layout); // (K, (N,Z,P,Q)) + + Tensor gOutput_mn = zipped_divide(mOutputScatter, typename AmpereUnpredicatedFprop::TilerOut{}); // ((BLK_M, BLK_N), (m', n')) + dim3 lauch_grid {static_cast(size<1,1>(gOutput_mn)), static_cast(size<1,0>(gOutput_mn)), 1}; + constexpr size_t smem_size = sizeof(typename AmpereUnpredicatedFprop::SharedStorage); + + print("xforemed gather layout ((N,Z,P,Q), (C,T,R,S)) = "); print(xformed_act_composed_layout); print("\n"); + print("Output scatter layout ( K, (N,Z,P,Q)) = "); print(out_composed_layout); print("\n"); + print("Filter layout ( K, (C,T,R,S)) = "); print(filter_layout); print("\n"); + + CHECK_CUDA(cudaFuncSetAttribute( + kernel_entrypoint, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + + cudaEvent_t start, stop; + CHECK_CUDA(cudaEventCreate(&start)); + CHECK_CUDA(cudaEventCreate(&stop)); + CHECK_CUDA(cudaEventRecord(start)); + for (int i = 0; i < num_iterations; ++i) { + kernel_entrypoint + <<>>( + mFilter, mXformedActGather, mOutputScatter); + } + CHECK_CUDA(cudaEventRecord(stop)); + CHECK_CUDA(cudaEventSynchronize(stop)); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + milliseconds /= float(num_iterations); + + double tflop_count = (2 * double(size<0>(xformed_act_logical_inner)) * double(size(filter_layout))) / double(1e12); + double tflops = tflop_count / (double(milliseconds) / double(1e3)); + printf("Conv TFLOP 
count = %f\n", tflop_count); + printf("Conv gather/scatter perf: %fms | TFLOP/s = %f\n", milliseconds, tflops); + + return 0; +} + +int +main(int argc, char const** argv) { + cutlass::CommandLine cmd(argc, argv); + std::cout << "Ampere convolution forward propogation kernel supporting both affine and gather/scatter tensors.\n\n"; + if (cmd.check_cmd_line_flag("help")) { + std::cout + << "Options:\n" + "\t--n= Sets the number of images for the input activation tensor (dataset size). Default = 131072.\n" + "\t--i= Sets the benchmarking repetitions. Default = 128.\n" + "\t--nocheck If specified, skips the reference check for dense kernel.\n" + "\t--help Displays this help message and exits.\n"; + return 0; + } + + + cudaDeviceProp props; + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + if (props.major < 8) { + std::cerr << "This example requires an Ampere GPU or newer.\n"; + return 0; + } + + int num_images = 4320; + cmd.get_cmd_line_argument("n", num_images, 4320); + int num_iterations = 128; + cmd.get_cmd_line_argument("i", num_iterations, 128); + bool do_host_ref_check = not cmd.check_cmd_line_flag("no-check"); + + auto D = typename AmpereUnpredicatedFprop::D{}; + auto H = typename AmpereUnpredicatedFprop::H{}; + auto W = typename AmpereUnpredicatedFprop::W{}; + auto Z = typename AmpereUnpredicatedFprop::Z{}; + auto P = typename AmpereUnpredicatedFprop::P{}; + auto Q = typename AmpereUnpredicatedFprop::Q{}; + auto C = typename AmpereUnpredicatedFprop::C{}; + auto K = typename AmpereUnpredicatedFprop::K{}; + + auto activation_layout = make_layout( + make_shape (make_shape (num_images, D, H, W), make_shape ( C, _1{},_1{},_1{})), + make_stride(make_stride( D*H*W*C, H*W*C, W*C, C), make_stride(_1{}, _0{},_0{},_0{}))); + + auto filter_layout = typename AmpereUnpredicatedFprop::GmemLayoutFlt{}; + + auto output_layout = make_ordered_layout( + make_shape( K, make_shape(num_images, Z, P, Q)), + make_step (_0{}, make_step ( _4{},_3{},_2{},_1{}))); + + print("Filter layout ( K, (C,T,R,S)) = "); print(filter_layout); print("\n"); + print("Activation layout ((N,D,H,W), (C,1,1,1)) = "); print(activation_layout); print("\n"); + print("Output layout ( K, (N,Z,P,Q)) = "); print(output_layout); print("\n"); + + // allocate tensors + std::cout << "Allocating tensors ... "; + thrust::universal_vector activation_data(size_t(cute::size(activation_layout)), float(0)); + thrust::universal_vector filter_data(size_t(cute::size(filter_layout)), float(0)); + thrust::universal_vector output_data(size_t(cute::size(output_layout)), float(0)); + thrust::universal_vector output_data_ref(size_t(cute::size(output_layout)), float(0)); + std::cout << "done.\n"; + + // init tensors + std::cout << "Initializing data ... " << std::flush; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution uniform_dist(-1.0, 1.0); + for (std::size_t i = 0; i < size_t(cute::size(activation_layout)); ++i) { + activation_data[i] = uniform_dist(gen); + } + + for (std::size_t i = 0; i < size_t(cute::size(filter_layout)); ++i) { + filter_data[i] = uniform_dist(gen); + } + std::cout << "done.\n"; + + // set up index buffers for gather/scatter, fill with indireciton indices in reversed order + std::cout << "Initializing gather/scatter index buffers ... 
"; + thrust::universal_vector gather_idx_buf(size_t(size<0>(activation_layout))); + thrust::universal_vector scatter_idx_buf(size_t(size<1>(output_layout))); + thrust::sequence(gather_idx_buf.rbegin(), gather_idx_buf.rend()); + thrust::sequence(scatter_idx_buf.rbegin(), scatter_idx_buf.rend()); + std::cout << "done.\n"; + + // launch dense + std::cout << "\nRunning dense fprop kernel\n"; + int passed = ampere_dense_conv_fprop( + num_images, + activation_data.data().get(), + filter_data.data().get(), + output_data.data().get(), + output_data_ref.data().get(), + num_iterations, + do_host_ref_check); + + // launch gather/scatter + std::cout << "\nRunning gather/scatter fprop kernel\n"; + ampere_gather_scatter_conv_fprop( + num_images, + activation_data.data().get(), + gather_idx_buf.data().get(), + filter_data.data().get(), + output_data.data().get(), + scatter_idx_buf.data().get(), + num_iterations); + + return passed; +} diff --git a/examples/60_cutlass_import/CMakeLists.txt b/examples/60_cutlass_import/CMakeLists.txt new file mode 100644 index 0000000000..974bf4102b --- /dev/null +++ b/examples/60_cutlass_import/CMakeLists.txt @@ -0,0 +1,66 @@ +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# This example demonstrates building against and utilizing an +# installed CUTLASS library. Unlike the other examples, this example +# is not built within the standard CUTLASS CMake flow, but rather +# relies on a pre-installed CUTLASS package. If the CUTLASS package is +# not installed in a standard location, provide the root location of +# the install with "-DCUTLASS_DIR=" CMake +# argument or any of the other features CMake allows for specifying +# locations of installed CMake packages via find_package(). 
+ +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) + +project(cutlass_import_example VERSION 0.2 LANGUAGES CXX CUDA) + +if (CUTLASS_DIR) + message(STATUS "Using CUTLASS specified at ${CUTLASS_DIR}.") + list(APPEND CMAKE_PREFIX_PATH ${CUTLASS_DIR}) +endif() + +find_package(NvidiaCutlass 2.0 REQUIRED) + +message(STATUS "CUTLASS: ${NvidiaCutlass_DIR}") + +add_executable(example) + +target_sources(example PRIVATE main.cpp) + +target_include_directories( + example + PRIVATE + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ) + +target_link_libraries( + example + PRIVATE + nvidia::cutlass::cutlass + nvidia::cutlass::library + ) diff --git a/examples/60_cutlass_import/main.cpp b/examples/60_cutlass_import/main.cpp new file mode 100644 index 0000000000..f17f545892 --- /dev/null +++ b/examples/60_cutlass_import/main.cpp @@ -0,0 +1,66 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief This example demonstrates utilizing an installed CUTLASS library. +*/ + +#include + +#include +#include + +int main(int argc, char ** argv) { + + // The operations built into the CUTLASS library are managed by a + // Manifest. The manifest is populated with a call to one of the + // "initialize" methods. + + cutlass::library::Manifest manifest; + + initialize_all(manifest); + + // Once initialized, the manifest can be queried for operations, + // and those operations can be further inspected via methods + // exposed in the library headers. + // + // Here, we simply enumerate the embedded kernels and list them. + + auto & opVec = manifest.operations(); + + std::cout << "Manifest contains " << opVec.size() << " operations, listed below." 
<< std::endl; + + for(auto opIter = manifest.begin(); opIter != manifest.end(); ++opIter) { + std::cout << (*opIter)->description().name << std::endl; + } + + return 0; +} diff --git a/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu b/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu new file mode 100644 index 0000000000..8bb14b4556 --- /dev/null +++ b/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu @@ -0,0 +1,534 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Hopper GEMM + Top-K + Softmax fusion + + This example illustrates how to use the LinCombTopKSoftmaxCol EVT node to fuse + Top-K and Softmax into the GEMM epilogue, with certain assumptions made. + + Those assumptions are as: + 1. Fusion is over the N dimension. + 2. Top-K is either 2 or 4 elements, and the value is static (meaning two kernels have to be + compiled to support both.) + 3. The GEMM tile shape along N is greater than or equal to problem size + along N. + + + The example runs the fused GEMM kernel, along with a standard unfused host reference, and + manually performs Top-K and softmax, and compares the error between tensors. + + Note that some numerical error (smaller than 1e-5) is to be expected, but this is true + in most efficient reduction kernels, because floating point addition is not necessarily + associative. 
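+
+ As a rough worked illustration of the fusion under the assumptions above (the numbers are
+ purely illustrative): with TopK = 2, a GEMM output row of [0.1, 2.0, -1.0, 1.0] keeps its two
+ largest entries, 2.0 and 1.0, and the softmax for that row is computed over those selected
+ values, i.e. exp(2.0) / (exp(2.0) + exp(1.0)) ~= 0.731 and exp(1.0) / (exp(2.0) + exp(1.0)) ~= 0.269.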
+*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/error_metrics.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gett.hpp" + + +#include "helper.h" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +static constexpr int TopK = 2; +static constexpr bool EnableTopKSoftmax = TopK > 1; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// + +// A matrix configuration +using ElementA = cutlass::half_t; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = cutlass::half_t; // Element type for B matrix operand +using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// C matrix configuration +using ElementC = void; +using LayoutC = cutlass::layout::RowMajor; +constexpr int AlignmentC = 1; + +// D matrix configuration +using ElementD = cutlass::half_t; // Element type for C and D matrix operands +using LayoutD = cutlass::layout::RowMajor; // Layout type for output +constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of output in units of elements (up to 16 bytes) + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ElementCompute = float; // Element type for epilogue computation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using TileShape = Shape<_64,_64,_128>; // Threadblock-level tile size +using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecialized; +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized; + +// Top-K + Softmax fusion operation +using FusionOperation = std::conditional_t, + typename cutlass::epilogue::fusion::LinearCombination +>; + +// The fusion op only allows for epilogue tiles matching the mainloop tile. 
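+// cute::take<0,2>(TileShape{}) below keeps only the first two modes (M and N) of the CTA tile,
+// i.e. the full 64x64 output tile of this configuration, rather than letting the epilogue
+// builder choose a smaller epilogue tile automatically.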
+using EpilogueTileType = decltype(cute::take<0,2>(TileShape{})); + +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, + TileShape, ClusterShape, + EpilogueTileType, + ElementAccumulator, ElementCompute, + ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, + EpilogueSchedule, + FusionOperation + >::CollectiveOp; + +using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementA, LayoutA, AlignmentA, + ElementB, LayoutB, AlignmentB, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloop, + CollectiveEpilogue +>; + +using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + +// Extract information from Gemm kernel. +using EpilogueOutputOp = typename Gemm::EpilogueOutputOp; +using ElementScalar = typename EpilogueOutputOp::ElementScalar; + +using StrideA = typename Gemm::GemmKernel::StrideA; +using StrideB = typename Gemm::GemmKernel::StrideB; +using StrideD = typename Gemm::GemmKernel::StrideD; + +/// Initialization +StrideA stride_A; +StrideB stride_B; +StrideD stride_D; +uint64_t seed; + +cutlass::HostTensor tensor_A; +cutlass::HostTensor tensor_B; +cutlass::HostTensor tensor_D; +cutlass::HostTensor tensor_ref_D; + +using LayoutScalar = cutlass::layout::PackedVectorLayout; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help = false; + + int iterations = 1000; + int m = 16, n = 8, k = 64, l = 1; + double eps = 1e-5; + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m); + cmd.get_cmd_line_argument("n", n); + cmd.get_cmd_line_argument("k", k); + cmd.get_cmd_line_argument("l", l); + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("eps", eps); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "61_hopper_gemm_with_topk_and_softmax\n\n" + << " Hopper FP8 GEMM with Top-K and softmax fusion.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the l extent (batch) of the GEMM\n" + << " --iterations= Number of profiling iterations to perform.\n\n" + << " --eps= Threshold of numerical verification. 
Default: 1e-5.\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "61_hopper_gemm_with_topk_and_softmax" << " --m=16 --n=8 --k=1024 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * m * n * k; + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } + + float alpha() const { + return 1.f / static_cast(k); + } +}; + +/// Result structure +struct Result { + double avg_runtime_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + Result( + double avg_runtime_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess) + : + avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false) + {} + +}; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_tensor( + cutlass::TensorView view, + uint64_t seed) { + cutlass::reference::host::TensorFillRandomUniform( + view, seed, /* max = */ 1, /* min = */ -1, /* bits = */ 2); + return true; +} + +/// Initialize operands to be used in the GEMM and reference GEMM +void initialize(const Options &options) { + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l)); + + auto a_coord = cutlass::make_Coord(options.m * options.l, options.k); + auto c_coord = cutlass::make_Coord(options.m * options.l, options.n); + auto b_coord = cutlass::make_Coord(options.k, options.n * options.l); + + tensor_A.resize(a_coord); + tensor_B.resize(b_coord); + tensor_D.resize(c_coord); + tensor_ref_D.resize(c_coord); + + initialize_tensor(tensor_A.host_view(), seed + 2022); + initialize_tensor(tensor_B.host_view(), seed + 2023); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_D.sync_device(); +} + +/// Populates a Gemm::Arguments structure from the given commandline options +typename Gemm::Arguments args_from_options(const Options &options) { + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + {options.m, options.n, options.k, options.l}, + {tensor_A.device_data(), stride_A, tensor_B.device_data(), stride_B}, + { + {options.alpha(), 0.f}, // alpha, beta + nullptr, stride_D, + tensor_D.device_data(), stride_D + } + }; + + return arguments; +} + +bool verify(const Options &options) { + // + // Compute reference output + // + + // Create instantiation for device reference gemm kernel + auto A = cute::make_tensor(tensor_A.host_data(), + cute::make_layout(cute::make_shape(options.m, options.k, options.l), stride_A)); + auto B = cute::make_tensor(tensor_B.host_data(), + cute::make_layout(cute::make_shape(options.n, options.k, options.l), stride_B)); + auto D = cute::make_tensor(tensor_ref_D.host_data(), + cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_D)); + using unused_t = decltype(D); + + cutlass::reference::host::GettMainloopParams mainloop_params{A, B}; + + 
cutlass::reference::host::GettEpilogueParams< + ElementScalar, + ElementScalar, + ElementAccumulator, + ElementCompute, + unused_t, + decltype(D), + unused_t, // bias + unused_t, // aux + unused_t, // valpha + unused_t // vbeta + > epilogue_params; + + epilogue_params.D = D; + epilogue_params.alpha = options.alpha(); + epilogue_params.beta = 0.f; + + // get reference result + cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params); + + if constexpr (EnableTopKSoftmax) { + // top-K + softmax + for (int i = 0; i < options.m; ++i) { + + // Find Top-K + cutlass::Array top_k; + top_k.fill(-cutlass::platform::numeric_limits::infinity()); + for (int j = 0; j < options.n; ++j) { + auto val = static_cast(tensor_ref_D.host_view().ref().at({i, j})); + for (int top_k_idx = 0; top_k_idx < TopK; ++top_k_idx) { + if (val > top_k[top_k_idx]) { + // Shift down + for (int l = TopK - 1; l > top_k_idx; --l) { + top_k[l] = top_k[l - 1]; + } + top_k[top_k_idx] = val; + break; + } + } + } + + // This formulation of top-K + softmax only works when it is + // guaranteed that none of the top-K elements are repeated! + // If this is the case, the device kernel can also make mistakes, because + // A. Once the top-K values are reduced, and the operation is being applied, + // there is no way to tell repeated elements apart, so none are masked. + // B. The softmax sum of exps will be incorrect (because the repeated elements + // are not repeated in it.) + + ElementAccumulator max = top_k[0]; + ElementAccumulator sum = ElementAccumulator(0.f); + for (int top_k_idx = 0; top_k_idx < TopK; ++top_k_idx) { + sum = sum + cutlass::fast_exp(top_k[top_k_idx] - max); + } + + for (int j=0; j < options.n; ++j) { + auto val = tensor_ref_D.host_view().ref().at({i, j}); + if (val < top_k[TopK - 1]) { + tensor_ref_D.host_view().ref().at({i, j}) = static_cast(0.f); + } else { + // Softmax + auto softmax_val = cutlass::fast_exp(val - max) / sum; + tensor_ref_D.host_view().ref().at({i, j}) = static_cast(softmax_val); + } + } + } + } + + // compare_reference + tensor_D.sync_host(); + + double err = cutlass::reference::host::TensorRelativeErrorMetric( + tensor_D.host_view(), + tensor_ref_D.host_view()); + bool passed = err < options.eps; + + if (options.m <= 32 && options.n <= 32) { + std::cout << "GEMM output:\n" << tensor_D.host_view() << "\n\n"; + std::cout << "Reference output:\n" << tensor_ref_D.host_view() << "\n\n"; + } + + std::cout << " Disposition: " << (passed ? 
"Passed" : "Failed") << " \t Relative error: " << err << std::endl; + + return passed; +} + +/// Execute a given example GEMM computation +template +int run(Options &options) { + initialize(options); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm + auto arguments = args_from_options(options); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(gemm.run()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + Result result; + result.passed = verify(options); + + if (!result.passed) { + exit(-1); + } + + // Run profiling loop + if (options.iterations > 0) { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm.run()); + } + timer.stop(); + + // Compute average runtime and GFLOPs. + float elapsed_ms = timer.elapsed_millis(); + result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); + + std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; + } + + return 0; +} + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.0 Toolkit to run this example + // and must have compute capability at least 90. + if (__CUDACC_VER_MAJOR__ < 12) { + std::cerr << "This example requires CUDA 12 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(¤t_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (props.major < 9) { + std::cerr + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater).\n"; + return 0; + } + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // + // Evaluate CUTLASS kernels + // + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + run(options); +#endif + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/61_hopper_gemm_with_topk_and_softmax/CMakeLists.txt b/examples/61_hopper_gemm_with_topk_and_softmax/CMakeLists.txt new file mode 100644 index 0000000000..7d9160a733 --- /dev/null +++ b/examples/61_hopper_gemm_with_topk_and_softmax/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + 61_hopper_gemm_with_topk_and_softmax + 61_hopper_gemm_with_topk_and_softmax.cu + ) diff --git a/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu b/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu new file mode 100644 index 0000000000..c3f1ce709a --- /dev/null +++ b/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu @@ -0,0 +1,596 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Hopper Sparse GEMM example. + + This example demonstrates how to construct and run a structured sparse GEMM kernel + on NVIDIA Hopper architecture. + +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/transform/device/transform_universal_adapter.hpp" +#include "cutlass/transform/kernel/sparse_gemm_compressor.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/device/tensor_fill.h" + +#include "helper.h" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// + +// A matrix configuration +using ElementA = cutlass::half_t; // Element type for A matrix operand +using LayoutTagA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = cutlass::half_t; // Element type for B matrix operand +using LayoutTagB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// C/D matrix configuration +using ElementC = float; // Element type for C and D matrix operands +using LayoutTagC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes) + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using TileShape = Shape<_128,_128,_128>; // Threadblock-level tile size for sparse kernel +using TileShapeRef = Shape<_128,_128, _64>; // Threadblock-level tile size for reference (dense) kernel +using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks 
in a cluster +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecialized; // Kernel schedule policy +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized; // Epilogue schedule policy + +using ProblemShape = Shape; + +// Sparse kernel setup + +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, + ElementC, LayoutTagC, AlignmentC, + ElementC, LayoutTagC, AlignmentC, + EpilogueSchedule + >::CollectiveOp; + +using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, + ElementA, LayoutTagA, AlignmentA, + ElementB, LayoutTagB, AlignmentB, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule + >::CollectiveOp; + +using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + CollectiveMainloop, + CollectiveEpilogue +>; + +using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + +// Reference (dense) kernel setup + +using CollectiveEpilogueRef = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShapeRef, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, + ElementC, LayoutTagC, AlignmentC, + ElementC, LayoutTagC, AlignmentC, + EpilogueSchedule + >::CollectiveOp; + +using CollectiveMainloopRef = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementA, LayoutTagA, AlignmentA, + ElementB, LayoutTagB, AlignmentB, + ElementAccumulator, + TileShapeRef, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule + >::CollectiveOp; + +using GemmKernelRef = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + CollectiveMainloopRef, + CollectiveEpilogue +>; + +using GemmRef = cutlass::gemm::device::GemmUniversalAdapter; + +// Layouts +using LayoutA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutA; +using LayoutE = typename Gemm::GemmKernel::CollectiveMainloop::LayoutE; +using StrideB = typename Gemm::GemmKernel::StrideB; +using StrideC = typename Gemm::GemmKernel::StrideC; +using StrideD = typename Gemm::GemmKernel::StrideD; + +// Layouts for reference (non-sparse) tensors +using StrideA = cutlass::gemm::TagToStrideA_t; +using StrideE = StrideA; + +using ElementE = typename Gemm::GemmKernel::CollectiveMainloop::ElementE; +using SparseConfig = typename Gemm::GemmKernel::CollectiveMainloop::SparseConfig; + +// Offline compressor kernel +using CompressorUtility = cutlass::transform::kernel::StructuredSparseCompressorUtility< + ProblemShape, + ElementA, + LayoutTagA, + SparseConfig>; + +using CompressorKernel = cutlass::transform::kernel::StructuredSparseCompressor< + ProblemShape, + ElementA, + LayoutTagA, + SparseConfig, + cutlass::arch::Sm90>; + +using Compressor = cutlass::transform::device::TransformUniversalAdapter; + +// +// Data members +// + +ProblemShape problem_shape; + +StrideA stride_A; +StrideA stride_A_compressed; +StrideE stride_E; +StrideB stride_B; +StrideC stride_C; +StrideD stride_D; + +LayoutA layout_A; +LayoutE layout_E; + +uint64_t seed; + 
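+// Device-side buffers: block_A holds the full A operand consumed by the dense reference GEMM,
+// while block_A_compressed and block_E hold the packed non-zero values and the sparsity
+// metadata produced by the offline compressor, which the structured sparse kernel consumes
+// instead.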
+cutlass::DeviceAllocation block_A; +cutlass::DeviceAllocation block_A_compressed; +cutlass::DeviceAllocation block_E; +cutlass::DeviceAllocation block_B; +cutlass::DeviceAllocation block_C; +cutlass::DeviceAllocation block_D; +cutlass::DeviceAllocation block_D_ref; + +#endif // defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + float alpha, beta; + int iterations; + int m, n, k, l; + + Options(): + help(false), + m(5120), n(4096), k(16384), l(1), + alpha(1.f), beta(0.f), + iterations(10) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m); + cmd.get_cmd_line_argument("n", n); + cmd.get_cmd_line_argument("k", k); + cmd.get_cmd_line_argument("l", l); + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "62_hopper_sparse_gemm\n\n" + << " Hopper Sparse GEMM example.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent of the GEMM (batch size)\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "62_hopper_sparse_gemm" << " --m=4096 --n=5120 --k=8192 --l=1 --alpha=2 --beta=0.707 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * m * n * k; + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; + +#if defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::DeviceAllocation& block, + uint64_t seed) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = Element(2); + scope_min = Element(0); + } else if (bits_input <= 8) { + scope_max = Element(2); + scope_min = Element(-2); + } else { + scope_max = Element(8); + scope_min = Element(-8); + } + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, scope_max, scope_min, 0); + + return true; +} + +/// Make A structured sparse by replacing elements with 0 and compress it +bool sparsify_and_compress() +{ + auto [M, N, K, L] = problem_shape; + CompressorUtility compressor_utility(problem_shape, stride_A); + + int ME = compressor_utility.get_metadata_m_physical(); + int KE = compressor_utility.get_metadata_k_physical(); + int KC = compressor_utility.get_tensorA_k_physical(); + + 
block_A_compressed.reset(M * KC * L); + block_E.reset(ME * KE * L); + + stride_A_compressed = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, KC, L)); + stride_E = cutlass::make_cute_packed_stride(StrideE{}, cute::make_shape(ME, KE, L)); + + // Random sparsification is performed on host + std::vector block_A_host(block_A.size()); + cutlass::device_memory::copy_to_host(block_A_host.data(), block_A.get(), block_A.size()); + compressor_utility.structure_sparse_zero_mask_fill(block_A_host.data(), static_cast(seed + 2024)); + cutlass::device_memory::copy_to_device(block_A.get(), block_A_host.data(), block_A.size()); + + cutlass::KernelHardwareInfo hw_info; + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + typename Compressor::Arguments arguments { + problem_shape, + { block_A.get(), + stride_A, + block_A_compressed.get(), + block_E.get() }, + {hw_info} }; + + Compressor compressor_op; + size_t workspace_size = Compressor::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + CUTLASS_CHECK(compressor_op.can_implement(arguments)); + CUTLASS_CHECK(compressor_op.initialize(arguments, workspace.get())); + CUTLASS_CHECK(compressor_op.run()); + CUDA_CHECK(cudaDeviceSynchronize()); + + return true; +} + +/// Initialize operands to be used in the GEMM and reference GEMM +bool initialize(Options const& options) { + + problem_shape = make_tuple(options.m, options.n, options.k, options.l); + auto [M, N, K, L] = problem_shape; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + // Allocate memory for tensors + block_A.reset(M * K * L); + block_B.reset(N * K * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_D_ref.reset(M * N * L); + + // Fill input tensors with data + initialize_block(block_A, seed + 2021); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2023); + + // Replace 0 in A with 1 to avoid metadata changes + std::vector block_A_host(block_A.size()); + cutlass::device_memory::copy_to_host(block_A_host.data(), block_A.get(), block_A.size()); + for (size_t i = 0; i < block_A.size(); ++i) if (block_A_host[i] == ElementA(0)) block_A_host[i] = ElementA(1.0); + cutlass::device_memory::copy_to_device(block_A.get(), block_A_host.data(), block_A.size()); + + if (!sparsify_and_compress()) { + return false; + }; + + // Build the compressed/metadata layouts + layout_A = SparseConfig::fill_layoutA(problem_shape); + layout_E = SparseConfig::fill_layoutE(problem_shape); + + return true; +} + +/// Populates a Gemm::Arguments structure from the given commandline options +typename Gemm::Arguments make_args(Options const& options) +{ + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_shape, + { block_A_compressed.get(), layout_A, block_B.get(), stride_B, block_E.get(), layout_E }, + { { ElementAccumulator(options.alpha), ElementAccumulator(options.beta) }, + block_C.get(), stride_C, block_D.get(), stride_D } + }; + + return arguments; +} + +typename GemmRef::Arguments make_args_ref(Options const& options) +{ + typename GemmRef::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + 
problem_shape, + { block_A.get(), stride_A, block_B.get(), stride_B }, + { { ElementAccumulator(options.alpha), ElementAccumulator(options.beta) }, + block_C.get(), stride_C, block_D_ref.get(), stride_D } + }; + + return arguments; +} + +template +void print_device_tensor(cute::Tensor const& t) +{ + // Assumes size = cosize, i.e. compact tensor + std::vector data_host(t.size()); + cutlass::device_memory::copy_to_host(data_host.data(), t.data(), t.size()); + auto t_host = cute::make_tensor(data_host.data(), t.layout()); + cute::print_tensor(t_host); +} + +bool verify(Options const& options) { + CUDA_CHECK(cudaDeviceSynchronize()); + + bool passed = cutlass::reference::device::BlockCompareEqual(block_D_ref.get(), block_D.get(), block_D.size()); + +#if 0 + if (!passed) { + auto [M, N, K, L] = problem_shape; + CompressorUtility compressor_utility(problem_shape, stride_A); + int ME = compressor_utility.get_metadata_m_physical(); + int KE = compressor_utility.get_metadata_k_physical(); + int KC = compressor_utility.get_tensorA_k_physical(); + + cute::print("A (original): "); print_device_tensor(make_tensor(block_A.get(), make_shape(M, K, L), stride_A)); + cute::print("A (compressed): "); print_device_tensor(make_tensor(block_A_compressed.get(), make_shape(M, KC, L), stride_A_compressed)); + cute::print("E (physical): "); print_device_tensor(make_tensor(block_E.get(), make_shape(ME, KE, L), stride_E)); + cute::print("E (logical): "); print_device_tensor(make_tensor(block_E.get(), upcast(layout_E))); + cute::print("B: "); print_device_tensor(make_tensor(block_B.get(), make_shape(N, K, L), stride_B)); + cute::print("C: "); print_device_tensor(make_tensor(block_C.get(), make_shape(M, N, L), stride_C)); + cute::print("D reference: "); print_device_tensor(make_tensor(block_D_ref.get(), make_shape(M, N, L), stride_D)); + cute::print("D computed: "); print_device_tensor(make_tensor(block_D.get(), make_shape(M, N, L), stride_D)); + } +#endif + + return passed; +} + +template +struct Runner +{ + using Arguments = typename Gemm::Arguments; + + Runner(Arguments args): arguments(args) { + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + workspace.reset(workspace_size); + + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + } + + void run() { + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + CUTLASS_CHECK(gemm.run()); + } + + void benchmark(Options const& options) { + if (options.iterations > 0) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + run(); + } + timer.stop(); + + // Compute average runtime and GFLOPs. 
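+      // Note that options.gflops() uses the dense FLOP count (2*M*N*K), so the sparse and dense
+      // runs reported by this benchmark are on the same scale and directly comparable.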
+ float elapsed_ms = timer.elapsed_millis(); + double avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + double gflops = options.gflops(avg_runtime_ms / 1000.0); + + std::cout << " Avg runtime: " << avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS: " << gflops << std::endl; + } + } + + Gemm gemm; + Arguments arguments; + cutlass::device_memory::allocation workspace; +}; + +/// Execute the example (verification and timing) +void run(Options &options) { + bool init = initialize(options); + if (!init) { + std::cout << "Initialization failure" << std::endl; + exit(EXIT_FAILURE); + } + + Runner gemm(make_args(options)); + Runner gemm_ref(make_args_ref(options)); + + gemm.run(); + gemm_ref.run(); + + bool passed = verify(options); + + std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl; + std::cout << " Disposition: " << (passed ? "Passed" : "Failed") << std::endl; + + if (!passed) { + exit(EXIT_FAILURE); + } + + std::cout << "Sparse GEMM:" << std::endl; + gemm.benchmark(options); + + std::cout << "Dense GEMM:" << std::endl; + gemm_ref.benchmark(options); +} + +#endif // defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.2 Toolkit to run this example + // and must have compute capability at least 90. + if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 2)) { + std::cerr << "This example requires CUDA 12.2 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(¤t_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (props.major < 9) { + std::cerr + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater).\n"; + return 0; + } + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // + // Evaluate CUTLASS kernels + // + +#if defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) + run(options); +#endif + + return EXIT_SUCCESS; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/62_hopper_sparse_gemm/CMakeLists.txt b/examples/62_hopper_sparse_gemm/CMakeLists.txt new file mode 100644 index 0000000000..cf55da4552 --- /dev/null +++ b/examples/62_hopper_sparse_gemm/CMakeLists.txt @@ -0,0 +1,36 @@ + +# Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. 
Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Sparse kernel in this example triggers an ICE in gcc 7.5 +if (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.0)) +cutlass_example_add_executable( + 62_hopper_sparse_gemm + 62_hopper_sparse_gemm.cu + ) +endif() diff --git a/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu b/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu new file mode 100644 index 0000000000..03c54a8ee9 --- /dev/null +++ b/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu @@ -0,0 +1,500 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Hopper FP8 GEMM + L2 Weight Prefetch + + This example implements a non-persistent warp-specialized GEMM kernel for the Hopper + architecture with programmatic dependent launch (PDL) enabling prefetching weights into + L2 cache. + + For more information about dependent launch refer to the CUDA programming guide: + https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization + + In some cases, PDL can result in a window where a previous kernel is not actively utilizing + DRAM, and the next kernel sits idle until the previous finishes. During this window, the next + kernel can begin loading a non-dependent operand (i.e. weights in a linear projection are + typically static) and cache it in L2. + + The kernel and collective mainloop assume operand `A` corresponds to weights and operand `B` + corresponds to activations (so we can have very small batch/token count). + After initialization, the prefetch warp starts loading K tiles of `A` into an unused portion + of shared memory, and loads up to half of all K tiles that the same CTA would eventually load. + The exact number of K tiles loaded is determined by `args.mainloop.prefetch_ratio` \in + [0.0, 1.0]. Smaller values result in less prefetching, and larger values result in more. + Negative values result in a "best-effort" prefetch, meaning prefetcher will stop issuing weight + loads as soon as the activation DMA warp starts loading (as soon as it is signaled that the + previous kernel has flushed its memory.) + + The DMA warp responsible for loading `A` will also begin loading K tiles until it fills up + the available shared memory. + The DMA warp responsible for loading `B` will wait until activations are flushed to global + memory by the preceding kernel. + + Another mainloop parameter, `args.mainloop.overlap_ratio` \in [0.0, 1.0] determines how early + the next kernel (the one doing the prefetch) is launched. Smaller values result in greater + overlap, and larger values result in smaller overlap. Negative values disable PDL completely, + meaning there will be no overlap. This will make prefetch ineffective. + + These two runtime parameters should be tuned per problem size and GEMM config combination, and + if feasible, per-operation in an entire layer or model. + + NOTE: you must build this target with the following flag to enable Grid Dependency Control + instructions (GDC) in CUTLASS: + - CUTLASS_ENABLE_GDC_FOR_SM90 + + To lock persistence mode, power (350W), clocks (1005MHz) for evaluation (assumes device 0 and H100) + + $ sudo nvidia-smi -pm 1 -i 0 + + $ sudo nvidia-smi -i 0 -pl 350 + + $ sudo nvidia-smi -i 0 -lgc 1005 + + Example: + + $ mkdir build && cd build + + $ cmake .. 
-DCUTLASS_NVCC_ARCHS="90a" -DCUTLASS_ENABLE_GDC_FOR_SM90=1 + + $ cd examples/63_hopper_gemm_with_weight_prefetch + + $ make + + $ ./63_hopper_gemm_with_weight_prefetch --p=0.5 --o=0.5 +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gett.hpp" + + +#include "collective/dispatch_policy_extra.hpp" +#include "collective/builder.hpp" +#include "kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp" + +#include "helper.h" +#include "gemm_with_weight_prefetch_commandline.hpp" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// + +// A matrix configuration +using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = cutlass::float_e5m2_t; // Element type for B matrix operand +using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// C matrix configuration +using ElementC = cutlass::float_e4m3_t; // Element type for C and D matrix operands +using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes) + +// D matrix configuration +using ElementD = ElementC; +using LayoutD = LayoutC; +constexpr int AlignmentD = AlignmentC; + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ElementCompute = float; // Element type for epilogue computation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using TileShape = Shape<_64,_64,_128>; // Threadblock-level tile size +// Cluster_N > 1 is not supported yet. 
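+// (Hence the 1x1x1 cluster below.) The kernel schedule that follows,
+// KernelTmaWarpSpecializedFP8FastAccumWithPrefetchAndSplitDMA, is the variant with separate DMA
+// warps for A (weights) and B (activations), which lets weight loads and the L2 prefetch begin
+// before the B producer has passed griddepcontrol; the non-split variant,
+// KernelTmaWarpSpecializedFP8FastAccumWithPrefetch, is also available (see this example's README).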
+using ClusterShape = Shape<_1,_1,_1>; // Shape of the threadblocks in a cluster +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccumWithPrefetchAndSplitDMA; +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized; +using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, + TileShape, ClusterShape, + EpilogueTileType, + ElementAccumulator, ElementCompute, + ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, + EpilogueSchedule + >::CollectiveOp; + +using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + ElementA, LayoutA, AlignmentA, + ElementB, LayoutB, AlignmentB, + ElementAccumulator, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + +using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloop, + CollectiveEpilogue +>; + +using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + +// Extract information from Gemm kernel. +using EpilogueOutputOp = typename Gemm::EpilogueOutputOp; +using ElementScalar = typename EpilogueOutputOp::ElementScalar; + +using StrideA = typename Gemm::GemmKernel::StrideA; +using StrideB = typename Gemm::GemmKernel::StrideB; +using StrideC = typename Gemm::GemmKernel::StrideC; +using StrideD = typename Gemm::GemmKernel::StrideD; + +/// Initialization +StrideA stride_A; +StrideB stride_B; +StrideC stride_C; +StrideD stride_D; +uint64_t seed; + +cutlass::HostTensor tensor_A; +cutlass::HostTensor tensor_B; +cutlass::HostTensor tensor_C; +cutlass::HostTensor tensor_D; +cutlass::HostTensor tensor_ref_D; + +using LayoutScalar = cutlass::layout::PackedVectorLayout; +cutlass::HostTensor scalar_alpha; +cutlass::HostTensor scalar_beta; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result +{ + double avg_runtime_ms; + double gflops; + double eff_bw; + cutlass::Status status; + cudaError_t error; + bool passed; + + Result( + double avg_runtime_ms = 0, + double gflops = 0, + double eff_bw = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess) + : + avg_runtime_ms(avg_runtime_ms), gflops(gflops), eff_bw(eff_bw), status(status), error(error), passed(false) + {} + +}; + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_tensor( + cutlass::TensorView view, + uint64_t seed) { + + double scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } + else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } + else if (bits_output == 16) { + scope_max = 5; + scope_min = -5; + } + else { + scope_max = 8; + scope_min = -8; + } + 
cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope_max, scope_min, 0); + + return true; +} + +/// Initialize operands to be used in the GEMM and reference GEMM +void initialize(const Options &options) { + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, options.l)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l)); + + auto a_coord = cutlass::make_Coord(options.m * options.l, options.k); + auto c_coord = cutlass::make_Coord(options.m * options.l, options.n); + auto b_coord = cutlass::make_Coord(options.k, options.n * options.l); + + tensor_A.resize(a_coord); + tensor_B.resize(b_coord); + tensor_C.resize(c_coord); + tensor_D.resize(c_coord); + tensor_ref_D.resize(c_coord); + + initialize_tensor(tensor_A.host_view(), seed + 2022); + initialize_tensor(tensor_B.host_view(), seed + 2023); + initialize_tensor(tensor_C.host_view(), seed + 2024); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_C.sync_device(); + tensor_D.sync_device(); +} + +/// Populates a Gemm::Arguments structure from the given commandline options +typename Gemm::Arguments args_from_options(const Options &options) +{ + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + {options.m, options.n, options.k, options.l}, + {tensor_A.device_data(), stride_A, tensor_B.device_data(), stride_B}, + { + {}, // epilogue.thread + tensor_C.device_data(), stride_C, + tensor_D.device_data(), stride_D + } + }; + + auto &fusion_args = arguments.epilogue.thread; + fusion_args.alpha = options.alpha; + fusion_args.beta = options.beta; + fusion_args.alpha_ptr = scalar_alpha.device_data(); + fusion_args.beta_ptr = scalar_beta.device_data(); + + arguments.mainloop.overlap_ratio = options.overlap_ratio; + arguments.mainloop.prefetch_ratio = options.prefetch_ratio; + + return arguments; +} + +bool verify(const Options &options) { + // + // Compute reference output + // + + // Create instantiation for device reference gemm kernel + auto A = cute::make_tensor(tensor_A.host_data(), + cute::make_layout(cute::make_shape(options.m, options.k, options.l), stride_A)); + auto B = cute::make_tensor(tensor_B.host_data(), + cute::make_layout(cute::make_shape(options.n, options.k, options.l), stride_B)); + auto C = cute::make_tensor(tensor_C.host_data(), + cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_C)); + auto D = cute::make_tensor(tensor_ref_D.host_data(), + cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_D)); + using unused_t = decltype(D); + + cutlass::reference::host::GettMainloopParams mainloop_params{A, B}; + + cutlass::reference::host::GettEpilogueParams< + ElementScalar, + ElementScalar, + ElementAccumulator, + ElementCompute, + decltype(C), + decltype(D), + unused_t, // bias + unused_t, // aux + unused_t, // valpha + unused_t // vbeta + > epilogue_params; + + epilogue_params.C = C; + epilogue_params.D = D; + epilogue_params.alpha = options.alpha; + epilogue_params.beta = options.beta; + + // get reference result + cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params); + + // compare_reference + tensor_D.sync_host(); + bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), 
tensor_D.host_view()); + + return passed; +} + +/// Execute a given example GEMM computation +template +int run(Options &options) +{ + initialize(options); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm; + + // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm + auto arguments = args_from_options(options); + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + + // Initialize CUTLASS kernel with arguments and workspace pointer + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + + // Correctness / Warmup iteration + CUTLASS_CHECK(gemm.run(nullptr, nullptr, /* launch_with_pdl = */ options.overlap_ratio >= 0)); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + Result result; + result.passed = verify(options); + + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl; + + if (!result.passed) { + exit(-1); + } + + // Run profiling loop + if (options.iterations > 0) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm.run(nullptr, nullptr, /* launch_with_pdl = */ options.overlap_ratio >= 0)); + } + timer.stop(); + + // Compute average runtime and GFLOPs. + float elapsed_ms = timer.elapsed_millis(); + result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + double avg_runtime_s = (double)(result.avg_runtime_ms / 1000.0); + result.gflops = options.gflops(avg_runtime_s); + result.eff_bw = options.effective_bandwidth(avg_runtime_s, sizeof(ElementA), sizeof(ElementB), sizeof(ElementC), sizeof(ElementD)); + + std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; + std::cout << " Effective bandwidth: " << result.eff_bw << " GB/s" << std::endl; + } + + return 0; +} + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.0 Toolkit to run this example + // and must have compute capability at least 90. + if (__CUDACC_VER_MAJOR__ < 12) { + std::cerr << "This example requires CUDA 12 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. 
+    return 0;
+  }
+
+  cudaDeviceProp props;
+  int current_device_id;
+  CUDA_CHECK(cudaGetDevice(&current_device_id));
+  CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
+  if (props.major < 9) {
+    std::cerr
+      << "This example requires a GPU of NVIDIA's Hopper Architecture or "
+      << "later (compute capability 90 or greater).\n";
+    return 0;
+  }
+  //
+  // Parse options
+  //
+
+  Options options;
+
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  //
+  // Evaluate CUTLASS kernels
+  //
+
+#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+  run(options);
+#endif
+
+  return 0;
+}
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt b/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt
new file mode 100644
index 0000000000..f48673241a
--- /dev/null
+++ b/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include_directories(
+  .
+)
+
+cutlass_example_add_executable(
+  63_hopper_gemm_with_weight_prefetch
+  63_hopper_gemm_with_weight_prefetch.cu
+  )
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/README.md b/examples/63_hopper_gemm_with_weight_prefetch/README.md
new file mode 100644
index 0000000000..5dac1cc6c2
--- /dev/null
+++ b/examples/63_hopper_gemm_with_weight_prefetch/README.md
@@ -0,0 +1,82 @@
+# GEMM with L2 weight prefetch
+
+A non-persistent, warp-specialized GEMM aimed at low-latency inference.
+
+The kernel can optionally prefetch a portion of the weights (operand `A`) into L2 cache while the
+rest of the warps are waiting on the previous kernel to finish writing and flush its memory.
+A typical case is a normalization or reduction kernel that is immediately followed by a GEMM.
+
+It exposes two runtime parameters:
+1. `overlap_ratio`: how early `griddepcontrol.launch_dependent_grids` is issued.
+   Default is `0.5`, meaning after approximately half of the K tiles are loaded by the DMA warps.
+2. `prefetch_ratio`: what fraction of the K tiles to prefetch.
+   Default is `-1.0`, meaning prefetching stops as soon as the other DMA warps are past
+   `griddepcontrol`.
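+
+Both ratios are ordinary fields on the mainloop arguments, so they can be adjusted per GEMM call
+without recompiling. A minimal sketch, reusing the `Options` and `args_from_options` helpers
+defined in this example's source:
+
+```cxx
+// Sketch: adjust the prefetch behavior per invocation.
+auto arguments = args_from_options(options);   // populates all other GEMM arguments as in the example
+arguments.mainloop.overlap_ratio  = 0.8f;      // launch dependent grids after ~80% of K tiles are loaded
+arguments.mainloop.prefetch_ratio = 0.7f;      // prefetch roughly 70% of the K tiles of A
+// Values must not exceed 1.0; negative values select the default behaviors described above.
+```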
+
+It is highly recommended to auto-tune these parameters per GEMM against an end-to-end runtime
+measurement (an entire transformer layer or a few of them, but probably not the entire model).
+
+TMA loads use non-default cache hints: `A` (weights) is loaded with `EvictFirst`, and `B` (activations)
+is loaded with `EvictLast`.
+
+## Getting started
+To use this kernel in your own target, add this directory to your includes, and include the
+following headers from this example:
+
+```cxx
+#include "collective/dispatch_policy_extra.hpp"
+#include "collective/builder.hpp"
+#include "kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp"
+```
+
+Then use either of the two new kernel schedules:
+
+```cxx
+// Without separate warps for A and B
+using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccumWithPrefetch;
+
+// With separate warps for A and B
+using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccumWithPrefetchAndSplitDMA;
+```
+
+The kernel with separate warps for A and B
+(`KernelTmaWarpSpecializedFP8FastAccumWithPrefetchAndSplitDMA`)
+is expected to be the faster of the two, especially since it allows the kernel to load
+weights into shmem ahead of the `griddepcontrol`. The sketch below shows how a schedule plugs
+into the collective builder.
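+
+The following is an illustrative sketch of plugging one of these schedules into the CUTLASS 3.x
+collective builder. The element types, layouts, alignments, and shapes are placeholder choices
+(FP8 inputs in a TN layout, which these schedules require), and `CollectiveEpilogue` is assumed
+to have been built beforehand in the usual way:
+
+```cxx
+using ElementA     = cutlass::float_e4m3_t;                      // placeholder FP8 input types
+using ElementB     = cutlass::float_e4m3_t;
+using TileShape    = cute::Shape<cute::_128, cute::_128, cute::_128>;
+using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;  // clusters larger than 1 CTA are unsupported
+
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
+    ElementA, cutlass::layout::RowMajor,    16,                  // A is K-major ("T")
+    ElementB, cutlass::layout::ColumnMajor, 16,                  // B is K-major ("N")
+    float,                                                       // accumulator type
+    TileShape, ClusterShape,
+    cutlass::gemm::collective::StageCountAutoCarveout<
+        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+    cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccumWithPrefetchAndSplitDMA
+  >::CollectiveOp;
+```
+
+The resulting `CollectiveOp` is then composed with an epilogue and `kernel::GemmUniversal` as in
+other CUTLASS 3.x examples.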
+
+As for other GEMM parameters: thread block clusters larger than 1 CTA are not yet supported, and
+since the kernel layer implementation is warp specialized and TMA-based, other kernel layers or
+collectives would require reimplementation.
+
+## Example
+
+Using the example is straightforward: build it, then run with your choice of `MNK`:
+
+```bash
+./63_hopper_gemm_with_weight_prefetch --m=8192 --n=1 --k=8192
+```
+
+You can also disable the overlap, or try different overlap and prefetch ratios and see the
+difference:
+
+```bash
+echo "Without overlap and prefetch"
+./63_hopper_gemm_with_weight_prefetch --o=-1.0 --p=-1.0
+
+echo "Overlap ratio of 0.5, best effort prefetch"
+./63_hopper_gemm_with_weight_prefetch --o=0.5 --p=-1.0
+
+echo "Overlap ratio of 0.8, prefetch ratio of 0.7"
+./63_hopper_gemm_with_weight_prefetch --o=0.8 --p=0.7
+```
+
+However, note that the example still runs a single GEMM, and most of the performance improvement
+is expected in end-to-end applications.
+
+
+## Limitations
+* The parameter defaults are typically not good choices, especially `prefetch_ratio`.
+  When `prefetch_ratio` is unspecified (set to `-1.0`), the prefetch warp will `try_wait` on a
+  memory barrier before issuing every single TMA load, and in many cases this will slow down
+  prefetching to the point of being almost ineffective.
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/collective/builder.hpp b/examples/63_hopper_gemm_with_weight_prefetch/collective/builder.hpp
new file mode 100644
index 0000000000..bfb64820f0
--- /dev/null
+++ b/examples/63_hopper_gemm_with_weight_prefetch/collective/builder.hpp
@@ -0,0 +1,242 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/gemm/collective/collective_builder.hpp"
+
+#include "dispatch_policy_extra.hpp"
+#include "sm90_mma_tma_gmma_ss_warpspecialized_with_prefetch.hpp"
+#include "../pipeline/prefetch_pipeline_sm90.hpp"
+
+namespace cutlass::gemm::collective {
+
+namespace detail {
+
+// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count.
+template
+constexpr int
+compute_stage_count_or_override_prefetch(StageCount stage_count) {
+  return stages;
+}
+
+// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count.
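+// Note on the overload below: the usable capacity is CapacityBytes minus the epilogue carveout
+// (carveout_bytes), minus the A-tile buffer(s) reserved for the prefetcher
+// (MK_bytes * PrefetchStagesActual), minus the prefetcher pipeline storage; the remainder is
+// divided by the per-stage footprint (one A tile + one B tile + mainloop barrier storage) to
+// obtain the mainloop stage count.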
+template +constexpr int +compute_stage_count_or_override_prefetch(StageCountAutoCarveout stage_count) { + constexpr auto mainloop_pipeline_bytes = sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage); + constexpr auto prefetch_pipeline_bytes = sizeof(typename cutlass::detail::PrefetcherPipelineSharedStorage); + constexpr auto a_bits = cute::sizeof_bits_v; + constexpr auto b_bits = cute::sizeof_bits_v; + constexpr int MK_bytes = cutlass::bits_to_bytes(a_bits * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{})); //also the prefetch smem size + constexpr int NK_bytes = cutlass::bits_to_bytes(b_bits * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{})); + constexpr int stage_bytes = MK_bytes + NK_bytes + static_cast(mainloop_pipeline_bytes); + + return (CapacityBytes - carveout_bytes - MK_bytes * PrefetchStagesActual - prefetch_pipeline_bytes) / stage_bytes; +} + +} // namespace detail + +// GMMA_TMA_WS_FP8_FAST_ACCUM_SS + prefetch +template < + class ElementA, + class GmemLayoutATag, + int AlignmentA, + class ElementB, + class GmemLayoutBTag, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + class KernelScheduleType +> +struct CollectiveBuilder< + arch::Sm90, + arch::OpClassTensorOp, + ElementA, + GmemLayoutATag, + AlignmentA, + ElementB, + GmemLayoutBTag, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + ClusterShape_MNK, + StageCountType, + KernelScheduleType, + cute::enable_if_t< + cute::is_same_v> +> { + static_assert(is_static::value); + static_assert(is_static::value); + static_assert(detail::is_aligned(), + "Not meet TMA alignment requirement yet\n"); + static_assert(detail::is_input_fp8(), + "Only FP8 datatypes are compatible with these kernel schedules\n"); + // Dispatch TN fp8 kernels only to TMA warp specialized FP8 builder + static_assert(!detail::is_use_rmem_A(), + "Not supported for fp8 non-TN warp specialized kernels yet\n"); +#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED + static_assert(cutlass::detail::dependent_false, "Unsupported Toolkit for SM90 Collective Builder\n"); +#endif + + static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A(); + static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B(); + + using AtomLayoutMNK = Layout>; + + using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< + ElementA, ElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{})); + + using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); + using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{}))); + + using SmemLayoutAtomA = decltype(detail::ss_smem_selector< + GmmaMajorA, ElementA, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutAtomB = decltype(detail::ss_smem_selector< + GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + + static constexpr int PipelineStages = detail::compute_stage_count_or_override_prefetch(StageCountType{}); + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedWithPrefetch; + + using SmemCopyAtomA = void; + using SmemCopyAtomB = void; + + using CollectiveOp = CollectiveMma< + DispatchPolicy, + TileShape_MNK, + ElementA, + TagToStrideA_t, + ElementB, + TagToStrideB_t, + TiledMma, + GmemTiledCopyA, + SmemLayoutAtomA, + SmemCopyAtomA, + cute::identity, + GmemTiledCopyB, + 
SmemLayoutAtomB, + SmemCopyAtomB, + cute::identity + >; +}; + +// GMMA_TMA_WS_FP8_FAST_ACCUM_SS + prefetch and split DMA warps +template < + class ElementA, + class GmemLayoutATag, + int AlignmentA, + class ElementB, + class GmemLayoutBTag, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + class KernelScheduleType +> +struct CollectiveBuilder< + arch::Sm90, + arch::OpClassTensorOp, + ElementA, + GmemLayoutATag, + AlignmentA, + ElementB, + GmemLayoutBTag, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + ClusterShape_MNK, + StageCountType, + KernelScheduleType, + cute::enable_if_t< + cute::is_same_v> +> { + static_assert(is_static::value); + static_assert(is_static::value); + static_assert(detail::is_aligned(), + "Not meet TMA alignment requirement yet\n"); + static_assert(detail::is_input_fp8(), + "Only FP8 datatypes are compatible with these kernel schedules\n"); + // Dispatch TN fp8 kernels only to TMA warp specialized FP8 builder + static_assert(!detail::is_use_rmem_A(), + "Not supported for fp8 non-TN warp specialized kernels yet\n"); +#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED + static_assert(cutlass::detail::dependent_false, "Unsupported Toolkit for SM90 Collective Builder\n"); +#endif + + static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A(); + static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B(); + + using AtomLayoutMNK = Layout>; + + using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< + ElementA, ElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{})); + + using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); + using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{}))); + + using SmemLayoutAtomA = decltype(detail::ss_smem_selector< + GmmaMajorA, ElementA, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutAtomB = decltype(detail::ss_smem_selector< + GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + + static constexpr int PipelineStages = detail::compute_stage_count_or_override_prefetch(StageCountType{}); + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedWithPrefetch; + + using SmemCopyAtomA = void; + using SmemCopyAtomB = void; + + using CollectiveOp = CollectiveMma< + DispatchPolicy, + TileShape_MNK, + ElementA, + TagToStrideA_t, + ElementB, + TagToStrideB_t, + TiledMma, + GmemTiledCopyA, + SmemLayoutAtomA, + SmemCopyAtomA, + cute::identity, + GmemTiledCopyB, + SmemLayoutAtomB, + SmemCopyAtomB, + cute::identity + >; +}; + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/63_hopper_gemm_with_weight_prefetch/collective/dispatch_policy_extra.hpp b/examples/63_hopper_gemm_with_weight_prefetch/collective/dispatch_policy_extra.hpp new file mode 100644 index 0000000000..37369176f9 --- /dev/null +++ b/examples/63_hopper_gemm_with_weight_prefetch/collective/dispatch_policy_extra.hpp @@ -0,0 +1,61 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +namespace cutlass::gemm { + +// Standard non-persistent kernel with a single producer warp, and one prefetch warp. +// `A` is assumed to be static, and therefore the producer warp for `A` attempts to load `A` +// while the producer warp is waiting on griddepcontrol. +// GDC `launch_dependent_grids` is issued from the producer warp instead of math warps, and +// according to prefetch ratio. +struct KernelTmaWarpSpecializedFP8FastAccumWithPrefetch { }; + +// Non-persistent kernel with two producer warps (one for each of A and B), and one prefetch warp. +// `A` is assumed to be static, and therefore the producer warp for `A` attempts to load `A` +// while the producer warp for `B` is waiting on griddepcontrol. Producer warp for `A` does not +// wait on griddepcontrol and loads immediately. +struct KernelTmaWarpSpecializedFP8FastAccumWithPrefetchAndSplitDMA { }; + +template< + int Stages_, + class ClusterShape_ = Shape<_1,_1,_1>, + class KernelSchedule = KernelTmaWarpSpecializedFP8FastAccumWithPrefetch +> +struct MainloopSm90TmaGmmaWarpSpecializedWithPrefetch { + constexpr static int Stages = Stages_; + using ClusterShape = ClusterShape_; + using ArchTag = arch::Sm90; + using Schedule = KernelSchedule; +}; + +} // namespace cutlass::gemm diff --git a/examples/63_hopper_gemm_with_weight_prefetch/collective/sm90_mma_tma_gmma_ss_warpspecialized_with_prefetch.hpp b/examples/63_hopper_gemm_with_weight_prefetch/collective/sm90_mma_tma_gmma_ss_warpspecialized_with_prefetch.hpp new file mode 100644 index 0000000000..9bcb1f5a7e --- /dev/null +++ b/examples/63_hopper_gemm_with_weight_prefetch/collective/sm90_mma_tma_gmma_ss_warpspecialized_with_prefetch.hpp @@ -0,0 +1,872 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/numeric_types.h" +#include "cutlass/pipeline/pipeline.hpp" +#include "cutlass/trace.h" + +#include "cute/arch/cluster_sm90.hpp" +#include "cute/arch/copy_sm90.hpp" +#include "cute/algorithm/functional.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" +#include "cute/tensor_predicate.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" +#include "cutlass/arch/grid_dependency_control.h" + +#include "dispatch_policy_extra.hpp" + +#include "../pipeline/prefetch_pipeline_sm90.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +constexpr int PrefetchStages = 4; +constexpr int PrefetchInitialStages = 1; +// This determines how much shmem we set aside for prefetch. +// We don't reuse anything loaded by prefetcher, so we can keep +// loading into the same place -- there will be a conflict when +// writing, but it doesn't affect performance as much as the doors +// that this opens. 
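+// Note: PrefetchStages above is the number of prefetch transactions the prefetcher pipeline's
+// barriers track in flight, whereas PrefetchStagesActual below is the number of smem tile
+// buffers actually reserved for prefetch: a single buffer that every prefetch TMA load writes
+// into, since the prefetched data is never read back from smem and only serves to warm L2.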
+constexpr int PrefetchStagesActual = 1; + +} // namespace detail + +// WarpSpecialized Mainloop +template < + int Stages, + class ClusterShape, + class KernelSchedule, + class TileShape_, + class ElementA_, + class StrideA_, + class ElementB_, + class StrideB_, + class TiledMma_, + class GmemTiledCopyA_, + class SmemLayoutAtomA_, + class SmemCopyAtomA_, + class TransformA_, + class GmemTiledCopyB_, + class SmemLayoutAtomB_, + class SmemCopyAtomB_, + class TransformB_> +struct CollectiveMma< + MainloopSm90TmaGmmaWarpSpecializedWithPrefetch, + TileShape_, + ElementA_, + StrideA_, + ElementB_, + StrideB_, + TiledMma_, + GmemTiledCopyA_, + SmemLayoutAtomA_, + SmemCopyAtomA_, + TransformA_, + GmemTiledCopyB_, + SmemLayoutAtomB_, + SmemCopyAtomB_, + TransformB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedWithPrefetch; + using TileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using ElementB = ElementB_; + using StrideB = StrideB_; + using TiledMma = TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + static_assert(size<1>(ClusterShape{}) == 1, "Cluster shape N must be 1"); + using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{})); + + using PrefetcherPipeline = cutlass::PrefetchPipeline; + + using MainloopPipeline = cutlass::PipelineTmaAsync; + using PipelineState = cutlass::PipelineState; + using PipelineParams = typename MainloopPipeline::Params; + + static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + // Tile along modes in a way that maximizes the TMA box size. 
+ using SmemLayoutA = decltype(tile_to_shape( + SmemLayoutAtomA{}, + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); + using SmemLayoutB = decltype(tile_to_shape( + SmemLayoutAtomB{}, + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); + + static_assert(rank(SmemLayoutA{}) == 3 && size<2>(SmemLayoutA{}) == DispatchPolicy::Stages); + static_assert(rank(SmemLayoutB{}) == 3 && size<2>(SmemLayoutB{}) == DispatchPolicy::Stages); + + using PrefetchSmemLayoutA = decltype(make_layout(make_shape( + cute::Int(SmemLayoutA{})>{}, + cute::Int(SmemLayoutA{})>{}, + cute::Int{}))); + + static constexpr auto prefetch_smem_size = cute::cosize_v; + + static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more."); + static_assert(cute::is_base_of::value && + cute::is_base_of::value, + "MMA atom must source both A and B operand from smem_desc for this mainloop."); + static_assert(cute::is_same_v || cute::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + static_assert(cute::is_same_v || cute::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + + // TMA converts f32 input to tf32 when copying from GMEM to SMEM + // For all other types, cast to size equivalent uint type to avoid any rounding by TMA. + static constexpr bool ConvertF32toTF32A = cute::is_same_v; + static constexpr bool ConvertF32toTF32B = cute::is_same_v; + using InternalElementA = cute::conditional_t>>; + using InternalElementB = cute::conditional_t>>; + + // Defined outside the class where it's used, to work around MSVC issues + using PrefetcherPipelineStorage = ::cutlass::detail::PrefetcherPipelineSharedStorage; + + struct SharedStorage { + struct TensorStorage : cute::aligned_struct<128, _0> { + cute::array_aligned> smem_A; + cute::array_aligned> smem_B; + cute::array_aligned smem_prefetch; + } tensors; + + using PipelineStorage = typename MainloopPipeline::SharedStorage; + PipelineStorage pipeline; + PrefetcherPipelineStorage prefetcher_pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + // Host side kernel arguments + struct Arguments { + ElementA const* ptr_A; + StrideA dA; + ElementB const* ptr_B; + StrideB dB; + uint32_t mma_promotion_interval = 4; + float overlap_ratio = 0.5; + float prefetch_ratio = -1.0; + }; + + // Device side kernel params + struct Params { + // Assumption: StrideA is congruent with Problem_MK + using TMA_A = decltype(make_tma_copy_A_sm90( + GmemTiledCopyA{}, + make_tensor(static_cast(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), + SmemLayoutA{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{})); + // Assumption: StrideB is congruent with Problem_NK + using TMA_B = decltype(make_tma_copy_B_sm90( + GmemTiledCopyB{}, + make_tensor(static_cast(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}), + SmemLayoutB{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{})); + + TMA_A tma_load_a; + TMA_B tma_load_b; + uint32_t tma_transaction_bytes = TmaTransactionBytesMK + TmaTransactionBytesNK; + uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK; + uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK; + float overlap_ratio = 0.5; + float prefetch_ratio = -1.0; + }; + + // + // 
Methods + // + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + (void) workspace; + + // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + auto ptr_A = reinterpret_cast(args.ptr_A); + auto ptr_B = reinterpret_cast(args.ptr_B); + + Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA)); + Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB)); + + typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{}); + typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{}); + uint32_t transaction_bytes_mk = TmaTransactionBytesMK; + uint32_t transaction_bytes_nk = TmaTransactionBytesNK; + uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk; + + return { + tma_load_a, + tma_load_b, + transaction_bytes, + transaction_bytes_mk, + transaction_bytes_nk, + args.overlap_ratio, + args.prefetch_ratio + }; + } + + template + static bool + can_implement( + ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + constexpr int tma_alignment_bits = 128; + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; + bool implementable = cutlass::detail::check_alignment(cute::make_shape(M,K,L), StrideA{}); + constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(N,K,L), StrideB{}); + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return false; + } + + if (args.overlap_ratio > 1.0) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: `overlap_ratio` must be either negative (disabled) or in [0, 1].\n"); + return false; + } + + if (args.prefetch_ratio > 1.0) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: `prefetch_ratio` must be either negative (disabled) or in [0, 1].\n"); + return false; + } + + return true; + } + + static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; + static constexpr int K_PIPE_MMAS = 1; + static constexpr uint32_t TmaTransactionBytesMK = + cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof_bits::value)); + static constexpr uint32_t TmaTransactionBytesNK = + cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof_bits::value)); + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& mainloop_params) { + cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); + cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); + } + + /// Set up the data needed by this collective for load and mma. + /// Returns a tuple of tensors. 
The collective and the kernel layer have the contract + /// Returned tuple must contain at least two elements, with the first two elements being: + /// gA_mkl - The tma tensor, A after a local tile so it has shape (BLK_M,BLK_K,m,k,l) + /// gB_nkl - The tma tensor, B after a local tile so it has shape (BLK_N,BLK_K,n,k,l) + /// The rest of the tensors can be specified as needed by this collective. + template + CUTLASS_DEVICE auto + load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const { + using X = Underscore; + // Separate out problem shape for convenience + auto [M,N,K,L] = problem_shape_MNKL; + + // TMA requires special handling of strides to deal with coord codomain mapping + // Represent the full tensors -- get these from TMA + Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L)); // (m,k,l) + Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L)); // (n,k,l) + + // Make tiled views, defer the slice + Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) + Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) + + return cute::make_tuple(gA_mkl, gB_nkl); + } + + template < + class TensorA, class TensorB, + class KTileIterator, class BlockCoord + > + CUTLASS_DEVICE void + load( + Params const& mainloop_params, + MainloopPipeline pipeline, + PrefetcherPipeline prefetcher_pipeline, + PipelineState smem_pipe_write, + TensorA const& gA_mkl, + TensorB const& gB_nkl, + BlockCoord const& blk_coord, + KTileIterator k_tile_iter, int k_tile_count, + int thread_idx, + uint32_t block_rank_in_cluster, + TensorStorage& shared_tensors) { + int lane_predicate = cute::elect_one_sync(); + + if (lane_predicate) { + bool disable_gdc = mainloop_params.overlap_ratio < 0.0; + float overlap_ratio = mainloop_params.overlap_ratio; + int launch_dep_grids_threshold = static_cast(static_cast(k_tile_count - 1) * overlap_ratio); + + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // + // Prepare the TMA loads for A + // + + constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + + auto cta_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); + auto cta_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x); + + // Partition the inputs based on the current block coordinates. 
+ auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) + + // Applies the mapping from cta_tma_a + Tensor tAgA = cta_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) + Tensor tAsA = cta_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) + + // Applies the mapping from cta_tma_b + Tensor tBgB = cta_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) + Tensor tBsB = cta_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) + + uint16_t mcast_mask_a = 0; + uint16_t mcast_mask_b = 0; + + // Issue TmaLoads + // Maps the tile -> block, value + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); + } + } + + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); + } + } + + // We have to wait on dependent grids because of B. + cutlass::arch::wait_on_dependent_grids(); + + // Signal prefetcher to stop + prefetcher_pipeline.producer_arrive(); + + bool launch_dep_grids = false; + // Mainloop + CUTLASS_PRAGMA_NO_UNROLL + for (int cnt=0 ; k_tile_count > 0; --k_tile_count, ++cnt) { + // LOCK smem_pipe_write for _writing_ + pipeline.producer_acquire(smem_pipe_write); + + // + // Copy gmem to smem for *k_tile_iter + // + + using BarrierType = typename MainloopPipeline::ProducerBarrierType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); + + int write_stage = smem_pipe_write.index(); + copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a, cute::TMA::CacheHintSm90::EVICT_FIRST), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); + copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b, cute::TMA::CacheHintSm90::EVICT_LAST), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); + ++k_tile_iter; + + if (!disable_gdc && cnt >= launch_dep_grids_threshold && !launch_dep_grids) { + launch_dep_grids = true; + cutlass::arch::launch_dependent_grids(); + } + + // Advance smem_pipe_write + ++smem_pipe_write; + } + if (!disable_gdc && !launch_dep_grids) { + cutlass::arch::launch_dependent_grids(); + } + } + } + + template < + class TensorA, + class KTileIterator, class BlockCoord + > + CUTLASS_DEVICE void + load_MK( + Params const& mainloop_params, + MainloopPipeline pipeline, + PrefetcherPipeline prefetcher_pipeline, + PipelineState smem_pipe_write, + TensorA const& gA_mkl, + BlockCoord const& blk_coord, + KTileIterator k_tile_iter, int k_tile_count, + int thread_idx, + uint32_t block_rank_in_cluster, + TensorStorage& shared_tensors) { + int lane_predicate = cute::elect_one_sync(); + + if (lane_predicate) { + bool disable_gdc = mainloop_params.overlap_ratio < 0.0; + float overlap_ratio = mainloop_params.overlap_ratio; + int launch_dep_grids_threshold = static_cast(static_cast(k_tile_count - 1) * overlap_ratio); + + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + + // + // Prepare the TMA loads for A + // + + constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + + auto cta_tma_a = 
mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); + + // Partition the inputs based on the current block coordinates. + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + + // Applies the mapping from cta_tma_a + Tensor tAgA = cta_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) + Tensor tAsA = cta_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) + + uint16_t mcast_mask_a = 0; + + // Issue TmaLoads + // Maps the tile -> block, value + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); + } + } + + // Don't wait on dependent grids when loading `A`, because + // we assume `A` (weights) are static. + + bool launch_dep_grids = false; + // Mainloop + CUTLASS_PRAGMA_NO_UNROLL + for (int cnt=0 ; k_tile_count > 0; --k_tile_count, ++cnt) { + // LOCK smem_pipe_write for _writing_ + pipeline.producer_acquire(smem_pipe_write); + + // + // Copy gmem to smem for *k_tile_iter + // + + using BarrierType = typename MainloopPipeline::ProducerBarrierType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); + + int write_stage = smem_pipe_write.index(); + copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a, cute::TMA::CacheHintSm90::EVICT_FIRST), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); + ++k_tile_iter; + + if (!disable_gdc && cnt >= launch_dep_grids_threshold && !launch_dep_grids) { + launch_dep_grids = true; + cutlass::arch::launch_dependent_grids(); + } + + // Advance smem_pipe_write + ++smem_pipe_write; + } + if (!disable_gdc && !launch_dep_grids) { + cutlass::arch::launch_dependent_grids(); + } + } + } + + template < + class TensorB, + class KTileIterator, class BlockCoord + > + CUTLASS_DEVICE void + load_NK( + Params const& mainloop_params, + MainloopPipeline pipeline, + PrefetcherPipeline prefetcher_pipeline, + PipelineState smem_pipe_write, + TensorB const& gB_nkl, + BlockCoord const& blk_coord, + KTileIterator k_tile_iter, int k_tile_count, + int thread_idx, + uint32_t block_rank_in_cluster, + TensorStorage& shared_tensors) { + int lane_predicate = cute::elect_one_sync(); + + if (lane_predicate) { + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // + // Prepare the TMA loads for B + // + + constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + + auto cta_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x); + + // Partition the inputs based on the current block coordinates. 
+ auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; + Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) + + // Applies the mapping from cta_tma_b + Tensor tBgB = cta_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) + Tensor tBsB = cta_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) + + uint16_t mcast_mask_b = 0; + + // Issue TmaLoads + // Maps the tile -> block, value + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); + } + } + + // Ensure that the prefetched kernel does not touch + // unflushed global memory prior to this instruction + cutlass::arch::wait_on_dependent_grids(); + + // Signal prefetcher to stop + prefetcher_pipeline.producer_arrive(); + + // Mainloop + CUTLASS_PRAGMA_NO_UNROLL + for (; k_tile_count > 0; --k_tile_count) { + // LOCK smem_pipe_write for _writing_ + pipeline.producer_acquire(smem_pipe_write); + + // + // Copy gmem to smem for *k_tile_iter + // + + using BarrierType = typename MainloopPipeline::ProducerBarrierType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); + + int write_stage = smem_pipe_write.index(); + copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b, cute::TMA::CacheHintSm90::EVICT_LAST), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); + ++k_tile_iter; + + // Advance smem_pipe_write + ++smem_pipe_write; + } + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) { + int lane_predicate = cute::elect_one_sync(); + + // Issue the epilogue waits + if (lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all + * Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was + * still inverted from make_producer_start_state + */ + pipeline.producer_tail(smem_pipe_write); + } + } + + + template < + class TensorA, + class KTileIterator, class BlockCoord + > + CUTLASS_DEVICE void + prefetch_MK( + Params const& mainloop_params, + PrefetcherPipeline prefetcher_pipeline, + PipelineState smem_pipe_write, + TensorA const& gA_mkl, + BlockCoord const& blk_coord, + KTileIterator k_tile_iter, int k_tile_count, + int thread_idx, + uint32_t block_rank_in_cluster, + TensorStorage& shared_tensors) { + int lane_predicate = cute::elect_one_sync(); + + if (lane_predicate) { + bool do_best_effort_prefetch = mainloop_params.prefetch_ratio < 0; + float prefetch_ratio = do_best_effort_prefetch ? 1.0 : mainloop_params.prefetch_ratio; + int prefetch_iters = static_cast(static_cast(k_tile_count) * 0.5 * prefetch_ratio); + prefetch_iters = min(k_tile_count, ((prefetch_iters + detail::PrefetchStages - 1) / detail::PrefetchStages) * detail::PrefetchStages); + + Tensor sA = make_tensor( + make_smem_ptr(shared_tensors.smem_prefetch.data()), PrefetchSmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + + // + // Prepare the TMA loads for A + // + + constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + + auto cta_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); + + // Partition the inputs based on the current block coordinates. 
+ auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + + // Applies the mapping from cta_tma_a + Tensor tAgA = cta_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) + Tensor tAsA = cta_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) + + uint16_t mcast_mask_a = 0; + + // Issue TmaLoads + // Maps the tile -> block, value + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); + } + } + + uint32_t prefetcher_stage = 0; + uint32_t prefetcher_phase = 0; + CUTLASS_PRAGMA_NO_UNROLL + for (int cnt = 0 ; cnt < prefetch_iters; ++cnt) { + + if (do_best_effort_prefetch && prefetcher_pipeline.have_producers_arrived()) { + break; + } + + prefetcher_pipeline.prefetcher_acquire(prefetcher_stage, prefetcher_phase, cnt >= detail::PrefetchStages); + using BarrierType = typename PrefetcherPipeline::PrefetcherBarrierType; + BarrierType* tma_barrier = prefetcher_pipeline.prefetcher_get_barrier(prefetcher_stage); + + int write_stage = 0; + copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a, cute::TMA::CacheHintSm90::EVICT_FIRST), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); + ++k_tile_iter; + ++k_tile_iter; + + prefetcher_pipeline.advance_prefetcher_state(prefetcher_stage, prefetcher_phase); + } + prefetcher_pipeline.prefetcher_tail(prefetcher_stage, prefetcher_phase); + } + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Consumer Perspective + template < + class FrgTensorC + > + CUTLASS_DEVICE void + mma(MainloopPipeline pipeline, + PipelineState smem_pipe_read, + FrgTensorC& accum, + int k_tile_count, + int thread_idx, + TensorStorage& shared_tensors, + Params const& mainloop_params) { + static_assert(is_rmem::value, "C tensor must be rmem resident."); + static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3."); + static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3."); + static_assert(cute::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + static_assert(cute::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // + // Define C accumulators and A/B partitioning + // + + TiledMma tiled_mma; + auto thread_mma = tiled_mma.get_thread_slice(thread_idx); + + Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + // Allocate "fragments/descriptors" + Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE + + // + // PIPELINED MAIN LOOP + // + static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX), + "ERROR : Incorrect number of MMAs in flight"); + 
+ // We release buffers to producer warps(dma load) with some mmas in flight + PipelineState smem_pipe_release = smem_pipe_read; + + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + + warpgroup_fence_operand(accum); + CUTLASS_PRAGMA_UNROLL + for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + int read_stage = smem_pipe_read.index(); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + + warpgroup_commit_batch(); + + ++smem_pipe_read; + } + + warpgroup_fence_operand(accum); + // Mainloop GMMAs + k_tile_count -= prologue_mma_count; + + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + // + // Compute on k_tile + // + + int read_stage = smem_pipe_read.index(); + warpgroup_fence_operand(accum); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed + warpgroup_wait(); + warpgroup_fence_operand(accum); + + // UNLOCK smem_pipe_release, done _computing_ on it + pipeline.consumer_release(smem_pipe_release); + + // Advance smem_pipe_read and smem_pipe_release + ++smem_pipe_read; + ++smem_pipe_release; + } + + warpgroup_fence_operand(accum); + } + + /// Perform a Consumer Epilogue to release all buffers + CUTLASS_DEVICE void + mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + k_tile_count -= prologue_mma_count; + + smem_pipe_release.advance(k_tile_count); + + // Wait on all GMMAs to complete + warpgroup_wait<0>(); + + for (int count = 0; count < prologue_mma_count; ++count) { + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + ++smem_pipe_release; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/63_hopper_gemm_with_weight_prefetch/gemm_with_weight_prefetch_commandline.hpp b/examples/63_hopper_gemm_with_weight_prefetch/gemm_with_weight_prefetch_commandline.hpp new file mode 100644 index 0000000000..6be87768ee --- /dev/null +++ b/examples/63_hopper_gemm_with_weight_prefetch/gemm_with_weight_prefetch_commandline.hpp @@ -0,0 +1,117 @@ 
+/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Command line options parsing +struct Options { + + bool help = false; + + float alpha = 1.f, beta = 0.f; + float overlap_ratio = 0.5f, prefetch_ratio = 0.5f; + int iterations = 1000; + int n = 64, m = 1280, k = 8192, l = 1; + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m); + cmd.get_cmd_line_argument("n", n); + cmd.get_cmd_line_argument("k", k); + cmd.get_cmd_line_argument("l", l); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("p", prefetch_ratio, 0.5f); + cmd.get_cmd_line_argument("o", overlap_ratio, 0.5f); + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "63_hopper_gemm_with_weight_prefetch\n\n" + << " Hopper FP8 GEMM using a non-persistent kernel with L2 weight prefetch. 
\n" + << " For more details please refer to the source file.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the l extent (batch) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n" + << " --p= Prefetch ratio\n" + << " --o= Overlap ratio\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "63_hopper_gemm_with_weight_prefetch" << + " --m=1024 --n=512 --k=1024 --o=0.5 --p=0.5 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * m * n * k * l; + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } + + /// Compute effective bandwidth in GB/sec + double effective_bandwidth( + double runtime_s, + size_t bytes_a, + size_t bytes_b, + size_t bytes_c, + size_t bytes_d + ) const + { + static double const kBytesPerGiB = double(1ull << 30); + + double bytes_in = + (double)(l) * (double)(m) * (double)(k) * (double)(bytes_a) + // A + (double)(l) * (double)(n) * (double)(k) * (double)(bytes_b) + // B + (beta != 0.f ? (double)(l) * (double)(m) * (double)(n) * (double)(bytes_c) : 0.f); // C + double bytes_out = (double)(l) * (double)(m) * (double)(n) * (double)(bytes_d); // D + + double gb_total = (bytes_in + bytes_out) / kBytesPerGiB; + return gb_total / runtime_s; + } +}; diff --git a/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp b/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp new file mode 100644 index 0000000000..6e33d8fc62 --- /dev/null +++ b/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp @@ -0,0 +1,561 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/kernel_hardware_info.hpp" +#include "cute/arch/cluster_sm90.hpp" +#include "cutlass/arch/reg_reconfig.h" +#include "cutlass/arch/mma_sm90.h" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp" +#include "cutlass/pipeline/pipeline.hpp" +#include "cutlass/trace.h" + +#include "cute/tensor.hpp" + +#include "../collective/dispatch_policy_extra.hpp" + +/////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::kernel { + +/////////////////////////////////////////////////////////////////////////////// + +// GEMM + Prefetch for the A tensor + (optional) split DMA warps +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class TileScheduler_ +> +class GemmUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + TileScheduler_, + cute::enable_if_t< + cute::is_same_v || + cute::is_same_v + > +> +{ +public: + // + // Type Aliases + // + using ProblemShape = ProblemShape_; + static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4, + "ProblemShape{} should be or "); + static constexpr bool IsGdcEnabled = cutlass::arch::IsGdcGloballyEnabled; + + static constexpr bool SplitWarps = cute::is_same_v; + + // Mainloop derived types + using CollectiveMainloop = CollectiveMainloop_; + using TileShape = typename CollectiveMainloop::TileShape; + using TiledMma = typename CollectiveMainloop::TiledMma; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ElementA = typename CollectiveMainloop::ElementA; + using StrideA = typename CollectiveMainloop::StrideA; + using ElementB = typename CollectiveMainloop::ElementB; + using StrideB = typename CollectiveMainloop::StrideB; + using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; + using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; + using ClusterShape = typename DispatchPolicy::ClusterShape; + using MainloopArguments = typename CollectiveMainloop::Arguments; + using MainloopParams = typename CollectiveMainloop::Params; + static_assert(ArchTag::kMinComputeCapability >= 90); + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using ElementC = typename CollectiveEpilogue::ElementC; + using StrideC = typename CollectiveEpilogue::StrideC; + using ElementD = typename CollectiveEpilogue::ElementD; + using StrideD = typename CollectiveEpilogue::StrideD; + using EpilogueArguments = typename CollectiveEpilogue::Arguments; + using EpilogueParams = typename CollectiveEpilogue::Params; + + static_assert(cute::is_void_v or cute::is_same_v, + "TMA warp-specialized kernel does not support specializing the tile 
scheduler."); + using TileSchedulerTag = TileScheduler_; + using TileScheduler = typename detail::TileSchedulerSelector< + TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler; + using TileSchedulerArguments = typename TileScheduler::Arguments; + + // Kernel level shared memory storage + struct SharedStorage { + // Mainloop and epilogue don't use smem concurrently since kernel is non-persistent, so we can use a union + union TensorStorage { + using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage; + using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; + + MainloopTensorStorage mainloop; + EpilogueTensorStorage epilogue; + } tensors; + + struct PipelineStorage : cute::aligned_struct<16, _1> { + using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage; + using PrefetcherPipelineStorage = typename CollectiveMainloop::PrefetcherPipelineStorage; + using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage; + + alignas(16) MainloopPipelineStorage mainloop; + alignas(16) EpiLoadPipelineStorage epi_load; + alignas(16) PrefetcherPipelineStorage prefetcher; + } pipelines; + }; + + static constexpr int SharedStorageSize = sizeof(SharedStorage); + + static constexpr uint32_t NumLoadWarpGroups = 1; + static constexpr uint32_t NumMmaWarpGroups = 1; + static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMma{})) + (NumLoadWarpGroups * NumThreadsPerWarpGroup); + static constexpr uint32_t MinBlocksPerMultiprocessor = 1; + + // Device side arguments + struct Arguments { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopArguments mainloop{}; + EpilogueArguments epilogue{}; + KernelHardwareInfo hw_info{}; + TileSchedulerArguments scheduler{}; + }; + + // Kernel entry point API + struct Params { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopParams mainloop{}; + EpilogueParams epilogue{}; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. 
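+  // If the mainloop implements an A/B swap (see the Has_SwapAB_v check below), the M and N
+  // extents of the kernel-level problem shape are exchanged here, while the collectives still
+  // receive the original, un-swapped problem shape when building their own Params.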
+ static + Params + to_underlying_arguments(Arguments const& args, void* workspace) { + (void) workspace; + auto problem_shape = args.problem_shape; + if constexpr (detail::Has_SwapAB_v) { + // swap M/N + get<0>(problem_shape) = get<1>(args.problem_shape); + get<1>(problem_shape) = get<0>(args.problem_shape); + } + return { + args.mode, + problem_shape, + CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace), + CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace) + }; + } + + static bool + can_implement(Arguments const& args) { + bool implementable = (args.mode == GemmUniversalMode::kGemm) or + (args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n"); + return implementable; + } + implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop); + implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue); + implementable &= TileScheduler::can_implement(args.scheduler); + + return implementable; + } + + static + size_t + get_workspace_size(Arguments const& args) { + return 0; + } + + static + cutlass::Status + initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + return Status::kSuccess; + } + + // Computes the kernel launch grid shape based on runtime parameters + static dim3 + get_grid_shape(Params const& params) { + auto cluster_shape = ClusterShape{}; + auto tile_shape = TileShape{}; + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + return TileScheduler::get_tiled_cta_shape_mnl( + problem_shape_MNKL, tile_shape, cluster_shape); + } + + static dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + using namespace cute; + using X = Underscore; + +#if defined(__CUDA_ARCH_FEAT_SM90_ALL) +# define ENABLE_SM90_KERNEL_LEVEL 1 +#endif + +// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a. +#if ! defined(ENABLE_SM90_KERNEL_LEVEL) + printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n"); +#else + + enum class WarpGroupRole { + Producer = 0, + Consumer = 1, + }; + // Split mode: use Warp0 to load NK and epilogue, Warp2 to load MK. + // Non-split mode: use Warp0 to load MK, NK and epilogue, Warp2 is unused. + // Both modes use Warp1 to prefetch. 
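+    // Quick reference for the producer warp group (indices match the enum below):
+    //   warp 0 (Warp0)      : TMA loads of NK (split mode) or MK and NK (non-split), plus epilogue loads
+    //   warp 1 (PrefetchMK) : L2 prefetch of the MK (A / weight) tensor
+    //   warp 2 (Warp2)      : TMA loads of MK (split mode only, otherwise unused)
+    //   warp 3 (UnusedWarp) : unused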
+ enum class ProducerWarpRole { + Warp0 = 0, + PrefetchMK = 1, + Warp2 = 2, + UnusedWarp = 3 + }; + + // Kernel level shared memory storage + SharedStorage& shared_storage = *reinterpret_cast(smem_buf); + + int thread_idx = int(threadIdx.x); + int lane_idx = canonical_lane_idx(); + int warp_idx = canonical_warp_idx_sync(); + int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup; + int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup; + auto warp_group_role = WarpGroupRole(canonical_warp_group_idx()); + auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group); + int lane_predicate = cute::elect_one_sync(); + uint32_t block_rank_in_cluster = cute::block_rank_in_cluster(); + + + // Issue Tma Descriptor Prefetch from a single thread + if ((warp_idx == 0) && lane_predicate) { + CollectiveMainloop::prefetch_tma_descriptors(params.mainloop); + CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue); + } + + // Mainloop Load pipeline + using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline; + typename MainloopPipeline::Params mainloop_pipeline_params; + mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0; + if (warp_group_role == WarpGroupRole::Producer && ( + producer_warp_role == ProducerWarpRole::Warp0 || + producer_warp_role == ProducerWarpRole::Warp2)) { + mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer; + mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes; + } + if (warp_group_role == WarpGroupRole::Consumer) { + mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer; + } + mainloop_pipeline_params.num_consumers = NumThreadsPerWarpGroup; + MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{}); + bool should_prefetch = params.mainloop.prefetch_ratio > 0; + using PrefetcherPipeline = typename CollectiveMainloop::PrefetcherPipeline; + typename PrefetcherPipeline::Params prefetcher_pipeline_params; + prefetcher_pipeline_params.num_prefetchers = 1; + if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::PrefetchMK) { + prefetcher_pipeline_params.should_prefetch = should_prefetch; + prefetcher_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes_mk; + } + PrefetcherPipeline prefetcher_pipeline(shared_storage.pipelines.prefetcher, prefetcher_pipeline_params); + + // Epilogue Load pipeline + using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline; + typename EpiLoadPipeline::Params epi_load_pipeline_params; + if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Warp0) { + epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer; + } + if (warp_group_role == WarpGroupRole::Consumer) { + epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer; + } + epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster(); + epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp; + epi_load_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup; + if constexpr (CollectiveEpilogue::RequiresTransactionBytes) { + epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes; + } + EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params); + + // Epilogue Store pipeline + using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline; + typename EpiStorePipeline::Params epi_store_pipeline_params; + 
epi_store_pipeline_params.always_wait = true; + EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params); + + // Initialize starting pipeline states for the collectives + // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding) + typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state; + typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state; + + // For the DMA Load (producer) we start with an opposite phase + // i.e., we skip all waits since we know that the buffer is indeed empty + PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state(); + PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state(); + PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state(); + + auto cluster_wait_fn = [&] () { + // We need this to guarantee that the Pipeline init is visible + // To all producers and consumer thread blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + // Non-prefetcher warps arrive and wait, + // Prefetcher warp can go ahead without waiting. + cute::cluster_arrive_relaxed(); + if (warp_group_role != WarpGroupRole::Producer || + producer_warp_role != ProducerWarpRole::PrefetchMK) { + cute::cluster_wait(); + } + return [] () {}; + } + else { + // __syncthreads() but only for non prefetcher warps + if (should_prefetch) { + + // Use a named barrier to let the prefetcher warp start loading into the L2 + // without waiting to sync with all other warps. + // All other warps need to sync because the mainloop pipeline init + // should be visible to all of them. + // Prefetcher has its own barriers, and the only warps it would need to sync + // with would be the DMA warps. + using ClusterSyncWithPrefetchBarrier = typename cutlass::arch::NamedBarrier; + auto prefetcher_arrive_barrier = ClusterSyncWithPrefetchBarrier( + blockDim.x * blockDim.y * blockDim.z, + /*reserved_named_barriers_*/ 14); + // Prefetcher warp doesn't arrive on this barrier. + auto cluster_arrive_barrier = ClusterSyncWithPrefetchBarrier( + blockDim.x * blockDim.y * blockDim.z - NumThreadsPerWarp, + /*reserved_named_barriers_*/ 15); + + if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::PrefetchMK) { + __syncwarp(); + prefetcher_arrive_barrier.arrive(); + } + else if (warp_group_role == WarpGroupRole::Producer) { + prefetcher_arrive_barrier.arrive_and_wait(); + cluster_arrive_barrier.arrive_and_wait(); + } + else { + prefetcher_arrive_barrier.arrive(); + cluster_arrive_barrier.arrive_and_wait(); + } + } else { + __syncthreads(); + } + return [] () {}; + } + } (); + + // Preconditions + static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. 
If batch mode is not needed, set L stride to Int<0>."); + + // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + + // Get the appropriate blocks for this thread block -- potential for thread block locality + auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K) + TiledMma tiled_mma; + + // In a warp specialized kernel, collectives expose data movement and compute operations separately + CollectiveMainloop collective_mainloop; + CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue); + + // Prepare and partition the input tensors. Expects a tuple of tensors where: + // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l) + // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l) + auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop); + static_assert(cute::tuple_size_v >= 2, "Output of load_init must have at least two elements (A, B)"); + + // Extract out partitioned A and B. + Tensor gA_mkl = get<0>(load_inputs); + Tensor gB_nkl = get<1>(load_inputs); + + // Compute m_coord, n_coord, and l_coord with their post-tiled shapes + auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl)); + auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl)); + auto l_coord = idx2crd(int(blockIdx.z), shape<4>(gB_nkl)); + auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); + + // Get pipeline iterators and increments from tensor shapes + auto k_tile_iter = cute::make_coord_iterator(shape<3>(gA_mkl)); + auto k_tile_count = size<3>(gA_mkl); + + // Wait for all thread blocks in the Cluster + cluster_wait_fn(); + + if (warp_group_role == WarpGroupRole::Producer) { + if (producer_warp_role == ProducerWarpRole::Warp0) { + if constexpr(SplitWarps) { + collective_mainloop.load_NK( + params.mainloop, + mainloop_pipeline, + prefetcher_pipeline, + mainloop_pipe_producer_state, + gB_nkl, + blk_coord, + k_tile_iter, k_tile_count, + lane_idx, + block_rank_in_cluster, + shared_storage.tensors.mainloop + ); + } + else { + collective_mainloop.load( + params.mainloop, + mainloop_pipeline, + prefetcher_pipeline, + mainloop_pipe_producer_state, + gA_mkl, gB_nkl, + blk_coord, + k_tile_iter, k_tile_count, + lane_idx, + block_rank_in_cluster, + shared_storage.tensors.mainloop + ); + } + // Update starting mainloop pipeline state for the pipeline drain + mainloop_pipe_producer_state.advance(k_tile_count); + // Make sure mainloop consumer has been waited upon before issuing epilogue load + collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state); + + if (collective_epilogue.is_producer_load_needed()) { + // Ensure warp is converged before issuing epilogue loads + __syncwarp(); + epi_load_pipe_producer_state = collective_epilogue.load( + epi_load_pipeline, + epi_load_pipe_producer_state, + problem_shape_MNKL, + blk_shape, + blk_coord, + tiled_mma, + lane_idx, + shared_storage.tensors.epilogue + ); + collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state); + } + } + else if (SplitWarps && producer_warp_role == ProducerWarpRole::Warp2) { + collective_mainloop.load_MK( + params.mainloop, + mainloop_pipeline, + prefetcher_pipeline, + mainloop_pipe_producer_state, + gA_mkl, + blk_coord, + k_tile_iter, k_tile_count, + lane_idx, + block_rank_in_cluster, + shared_storage.tensors.mainloop + ); + // Update starting mainloop 
pipeline state for the pipeline drain + mainloop_pipe_producer_state.advance(k_tile_count); + // Make sure mainloop consumer has been waited upon before issuing epilogue load + collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state); + } else if (producer_warp_role == ProducerWarpRole::PrefetchMK && should_prefetch) { + collective_mainloop.prefetch_MK( + params.mainloop, + prefetcher_pipeline, + mainloop_pipe_producer_state, + gA_mkl, + blk_coord, + k_tile_iter, k_tile_count, + lane_idx, + block_rank_in_cluster, + shared_storage.tensors.mainloop + ); + } + } + else if (warp_group_role == WarpGroupRole::Consumer) { + Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N) + + collective_mainloop.mma( + mainloop_pipeline, + mainloop_pipe_consumer_state, + accumulators, + k_tile_count, + warp_group_thread_idx, + shared_storage.tensors.mainloop, + params.mainloop + ); + + // Make sure the math instructions are done and free buffers before entering the epilogue + collective_mainloop.mma_tail( + mainloop_pipeline, + mainloop_pipe_consumer_state, + k_tile_count + ); + + // Epilogue and write to gD + auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] = + collective_epilogue.store( + epi_load_pipeline, + epi_load_pipe_consumer_state, + epi_store_pipeline, + epi_store_pipe_producer_state, + problem_shape_MNKL, + blk_shape, + blk_coord, + accumulators, + tiled_mma, + warp_group_thread_idx, + shared_storage.tensors.epilogue + ); + + collective_epilogue.store_tail( + epi_load_pipeline, + epi_load_pipe_consumer_state_next, + epi_store_pipeline, + epi_store_pipe_producer_state_next + ); + } +#endif + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel diff --git a/examples/63_hopper_gemm_with_weight_prefetch/pipeline/prefetch_pipeline_sm90.hpp b/examples/63_hopper_gemm_with_weight_prefetch/pipeline/prefetch_pipeline_sm90.hpp new file mode 100644 index 0000000000..7abd39ccfc --- /dev/null +++ b/examples/63_hopper_gemm_with_weight_prefetch/pipeline/prefetch_pipeline_sm90.hpp @@ -0,0 +1,161 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cute/arch/cluster_sm90.hpp" +#include "cutlass/arch/barrier.h" +#include "cute/container/array.hpp" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +namespace detail { + +// MSVC work-around +template +struct PrefetcherPipelineSharedStorage { + using TransactionBarrier = cutlass::arch::ClusterTransactionBarrier; + using Barrier = cutlass::arch::ClusterBarrier; + + TransactionBarrier tma_barrier[Stages]; + Barrier producer_ready_barrier; +}; + +} // end namespace detail + +using namespace cute; + +// Prefetcher pipeline is modeled after PipelineTmaAsync, with a cluster transaction +// barrier providing control over the number of concurrent outstanding TMA loads. +// There is also an additional cluster barrier which is only used when `prefetch_ratio` is unset. +// `prefetch_ratio` determines how many K tiles get loaded, and when unset, the prefetcher checks +// whether DMA warps are done waiting on griddepcontrol, and if so, stops issuing more TMA loads. +template +class PrefetchPipeline { +public : + static constexpr uint32_t Stages = Stages_; + using SharedStorage = detail::PrefetcherPipelineSharedStorage; + + using TransactionBarrier = typename SharedStorage::TransactionBarrier; + using Barrier = typename SharedStorage::Barrier; + using PrefetcherBarrierType = typename TransactionBarrier::ValueType; + + struct Params { + uint32_t transaction_bytes = 0; + uint32_t num_prefetchers = 1; + bool should_prefetch = false; + }; + + // Constructor + CUTLASS_DEVICE + PrefetchPipeline(SharedStorage& storage, Params params) + : params_(params) + , tma_barrier_ptr_(&storage.tma_barrier[0]) + , producer_ready_barrier_ptr_(&storage.producer_ready_barrier) { + + int lane_predicate = cute::elect_one_sync(); + if (params.should_prefetch && lane_predicate) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Stages; ++i) { + tma_barrier_ptr_[i].init(params.num_prefetchers); + } + producer_ready_barrier_ptr_[0].init(1); + } + } + + CUTLASS_DEVICE + void producer_arrive() { + if (params_.should_prefetch) { + producer_ready_barrier_ptr_[0].arrive(); + } + } + + CUTLASS_DEVICE + bool have_producers_arrived() { + if (params_.should_prefetch) { + uint32_t barrier_status_ = producer_ready_barrier_ptr_[0].try_wait(0); + auto barrier_status = static_cast(barrier_status_); + if (barrier_status == BarrierStatus::WaitDone) { + return true; // exit prefetcher loop + } + return false; + } + return true; + } + + CUTLASS_DEVICE + void prefetcher_acquire(uint32_t stage, uint32_t phase, bool should_wait) { + if (params_.should_prefetch) { + if (should_wait) { + tma_barrier_ptr_[stage].wait(phase ^ 1); + } + tma_barrier_ptr_[stage].arrive_and_expect_tx(params_.transaction_bytes); + } + } + + CUTLASS_DEVICE + void advance_prefetcher_state(uint32_t& stage, 
uint32_t& phase) { + if (params_.should_prefetch) { + stage++; + if (stage == Stages) { + stage = 0; + phase ^= 1; + } + } + } + + CUTLASS_DEVICE + void prefetcher_tail(uint32_t stage, uint32_t phase) { + if (params_.should_prefetch) { + // Wait on any already-issued loads + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < stage; ++i) { + tma_barrier_ptr_[i].wait(phase); + } + } + } + + CUTLASS_DEVICE + PrefetcherBarrierType* prefetcher_get_barrier(uint32_t stage) { + return reinterpret_cast(&tma_barrier_ptr_[stage]); + } + +private : + TransactionBarrier* tma_barrier_ptr_ = nullptr; + Barrier* producer_ready_barrier_ptr_ = nullptr; + Params params_; + +}; + +} // end namespace cutlass diff --git a/examples/64_ada_fp8_gemm_grouped/CMakeLists.txt b/examples/64_ada_fp8_gemm_grouped/CMakeLists.txt new file mode 100644 index 0000000000..183202593c --- /dev/null +++ b/examples/64_ada_fp8_gemm_grouped/CMakeLists.txt @@ -0,0 +1,35 @@ + +# Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +cutlass_example_add_executable( + 64_ada_fp8_gemm_grouped + ada_fp8_gemm_grouped.cu + ) diff --git a/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu b/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu new file mode 100644 index 0000000000..8e3dbbb08b --- /dev/null +++ b/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu @@ -0,0 +1,1208 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Ada FP8 GEMM Grouped With Per-Group Scale Example. + + This workload computes a batch of GEMM operations with distinct problem sizes. Pointers to matrices + in Global Memory are passed to the kernel in array (also held in Global Memory). Similarly, + leading dimensions and problem sizes are stored in arrays in GMEM. + + This differs from "Batched Array" GEMM because the size of each GEMM problem in the Grouped GEMM + concept may be distinct. + + The differences between this and the examples/24_gemm_grouped are: (1) this example scales the output of each GEMM by a different scalar value specified by alpha_ptr_array. (2) this example uses FP8 tensorcore. + + This benchmark program initializes a workspace with random problem sizes for a given number of + groups. Command line options enable overriding M, N, and/or K dimensions with uniform values to + model problems more similar to the traditional batched GEMM. + + Additionally, problem sizes are collected and binned to compute the same problem as a series of + conventional batched GEMMs (setup for this problem is not timed). This demonstrates the performance + enhancement achieved by implementing a specialized grouped GEMM kernel. 
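+
+    Conceptually, for the i-th problem in the group the kernel computes
+
+        D_i = alpha_i * (A_i * B_i) + beta * C_i
+
+    where each group has its own scalar alpha_i (supplied through alpha_ptr_array) and beta is a
+    single scalar shared by all groups.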
+ + Examples: + + # Runs a grouped GEMM with 100 random problem sizes + $ ./examples/64_ada_fp8_gemm_grouped/64_ada_fp8_gemm_grouped --groups=100 + + # Runs a grouped GEMM with 100 random problem sizes (with GEMM-K dimension equal to 1024) + $ ./examples/64_ada_fp8_gemm_grouped/64_ada_fp8_gemm_grouped --groups=100 --k=1024 --verbose=true + + # Runs a grouped GEMM that is equivalent to a batched GEMM + $ ./examples/64_ada_fp8_gemm_grouped/64_ada_fp8_gemm_grouped --groups=100 --m=2048 --n=1024 --k=1024 --verbose=true + + # Execute Grouped GEMM and profile with NSight + $ nv-nsight-cu-cli ./examples/64_ada_fp8_gemm_grouped/64_ada_fp8_gemm_grouped --m=256 --n=256 --k=256 --verbose=true \ + --iterations=1 --reference-check=false + +*/ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/kernel/gemm_grouped_per_group_scale.h" +#include "cutlass/gemm/kernel/default_gemm_grouped_per_group_scale.h" +#include "cutlass/gemm/device/gemm_grouped.h" +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm_complex.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_norm.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Result structure +struct Result { + + double runtime_ms; + double initialization_time_ms; + double gflops; + cutlass::Status status; + cudaError_t error; + bool passed; + + // + // Methods + // + + Result( + double runtime_ms = 0, + double initialization_time_ms = 0, + double gflops = 0, + cutlass::Status status = cutlass::Status::kSuccess, + cudaError_t error = cudaSuccess + ): + runtime_ms(runtime_ms), initialization_time_ms(initialization_time_ms), gflops(gflops), + status(status), error(error), passed(true) { } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Hash function for cutlass::gemm::GemmCoord +struct HashGemmCoord { + size_t operator()(cutlass::gemm::GemmCoord const &problem) const { + std::hash hasher; + return (hasher(problem.m() * 3)) ^ (hasher(1 + problem.n() * 5)) ^ (hasher(2 + problem.k() * 7)); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + bool reference_check; + bool profile_initialization; + bool sort_problems; + + std::vector problem_sizes; + + // problem size bins + std::unordered_map< + cutlass::gemm::GemmCoord, + std::vector, + HashGemmCoord> problem_bins; + + int alignment; + int problem_count; + int iterations; + int cuda_streams; + bool verbose; + float alpha; + std::vector alpha_array; + float beta; + std::string benchmark_path; + + std::string output_tag; + std::ofstream output_file; + + using GroupScheduleMode = cutlass::gemm::kernel::GroupScheduleMode; + std::vector scheduler_modes; + + 
std::unordered_map + str_to_scheduler_mode = { + {"kDeviceOnly", GroupScheduleMode::kDeviceOnly}, + {"kHostPrecompute", GroupScheduleMode::kHostPrecompute} + }; + + struct GroupScheduleModeHash { + size_t operator()(GroupScheduleMode m) const { + return static_cast(m); + } + }; + + std::unordered_map + scheduler_mode_to_str = { + {GroupScheduleMode::kDeviceOnly, "kDeviceOnly"}, + {GroupScheduleMode::kHostPrecompute, "kHostPrecompute"} + }; + + std::vector all_scheduler_modes = {GroupScheduleMode::kDeviceOnly, GroupScheduleMode::kHostPrecompute}; + + // + // Methods + // + + Options(): + help(false), + error(false), + alignment(16), + reference_check(true), + profile_initialization(false), + sort_problems(false), + problem_count(15), + iterations(20), + cuda_streams(0), + verbose(false), + alpha(1), + beta(), + scheduler_modes({GroupScheduleMode::kDeviceOnly}) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("alignment", alignment, 16); + cmd.get_cmd_line_argument("groups", problem_count, 15); + cmd.get_cmd_line_argument("alpha", alpha, 1.0f); + cmd.get_cmd_line_argument("beta", beta, 0.0f); + cmd.get_cmd_line_argument("iterations", iterations, 20); + cmd.get_cmd_line_argument("streams", cuda_streams, 0); + cmd.get_cmd_line_argument("verbose", verbose, false); + cmd.get_cmd_line_argument("reference-check", reference_check, true); + cmd.get_cmd_line_argument("profile-initialization", profile_initialization, false); + cmd.get_cmd_line_argument("sort-problems", sort_problems, false); + cmd.get_cmd_line_argument("benchmark", benchmark_path); + + std::vector scheduler_mode_strs; + cmd.get_cmd_line_arguments("scheduler-modes", scheduler_mode_strs); + + if (!scheduler_mode_strs.empty()) { + scheduler_modes.clear(); + if (scheduler_mode_strs.size() == 1 && scheduler_mode_strs[0] == "all") { + scheduler_modes = all_scheduler_modes; + } else { + for (std::string precomp_str : scheduler_mode_strs) { + auto it = str_to_scheduler_mode.find(precomp_str); + if (it != str_to_scheduler_mode.end()) { + scheduler_modes.push_back(it->second); + } else if (precomp_str == "all") { + std::cerr << "Flag --scheduler-modes=all must not contain other scheduler modes in list." << std::endl; + error = true; + return; + } else { + std::cerr << "Unrecognized scheduler mode '" << precomp_str << "'" << std::endl; + error = true; + return; + } + } + } + } + + std::string output_path; + cmd.get_cmd_line_argument("tag", output_tag); + cmd.get_cmd_line_argument("output_file", output_path); + + if (!output_path.empty()) { + + std::ios_base::openmode open_mode = std::ios_base::out; + + std::ifstream input_file(output_path.c_str()); + + if (input_file.good()) { + open_mode = std::ios_base::app; + input_file.close(); + } + + output_file.open(output_path.c_str(), open_mode); + + if (output_file.good() && open_mode != std::ios_base::app) { + output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPs\n"; + } + } + + // Decide how to initialize the problems + if (!benchmark_path.empty()) { + if (!benchmark_problems()) { + error = true; + problem_sizes.clear(); + return; + } + } + else { + randomize_problems(cmd); + } + + // Post-process the problem sizes + bin_problems(); + + // Initalize alpha array + randomize_alpha_ptr_array(cmd); + } + + void randomize_problems(cutlass::CommandLine &cmd) { + + // + // For now, randomly choose the problem sizes. 
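+    // Unless a dimension is fixed via --m/--n/--k, each extent is chosen as
+    // alignment * (1..256), i.e. a random multiple of the requested alignment.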
+ // + + int cmd_line_m = -1; + int cmd_line_n = -1; + int cmd_line_k = -1; + + cmd.get_cmd_line_argument("m", cmd_line_m); + cmd.get_cmd_line_argument("n", cmd_line_n); + cmd.get_cmd_line_argument("k", cmd_line_k); + + problem_sizes.reserve(problem_count); + + for (int i = 0; i < problem_count; ++i) { + + int m = cmd_line_m; + int n = cmd_line_n; + int k = cmd_line_k; + + if (m < 1) { + m = alignment * ((rand() % 256) + 1); + } + + if (n < 1) { + n = alignment * ((rand() % 256) + 1); + } + + if (k < 1) { + k = alignment * ((rand() % 256) + 1); + } + + cutlass::gemm::GemmCoord problem(m, n, k); + + problem_sizes.push_back(problem); + } + } + + void randomize_alpha_ptr_array(cutlass::CommandLine &cmd) { + alpha_array.resize(problem_count); + for (int i = 0; i < problem_count; ++i) { + alpha_array[i] = static_cast((rand() % 100) - 50 + alpha); + } + } + + /// Load a benchmark + bool benchmark_problems() { + std::ifstream file(benchmark_path); + if (!file.good()) { + return false; + } + + while (file.good()) { + + int idx = -1; + std::string extent_str; + + file >> idx >> extent_str; + + if (idx < 0 || extent_str.empty()) { + break; + } + + cutlass::gemm::GemmCoord extent; + std::vector tokens; + + cutlass::CommandLine::tokenize(tokens, extent_str, 'x'); + + for (int i = 0; i < int(tokens.size()); ++i) { + int x = std::atoi(tokens.at(i).c_str()); + + // round up + if (x % alignment) { + x += (alignment - (x % alignment)); + } + + extent.at(i) = x; + } + + if (extent.product()) { + problem_sizes.push_back(extent); + } + } + + return true; + } + + /// Post processes the problems + void bin_problems() { + + problem_bins.clear(); + + problem_count = int(problem_sizes.size()); + + // + // Insert the problem sizes into a sorted container class. This is *NOT* necessary + // to run the CUTLASS kernel, but it enables the execution of cublas's batched GEMM. + // + for (int i = 0; i < int(problem_sizes.size()); ++i) { + auto it = problem_bins.find(problem_sizes.at(i)); + if (it == problem_bins.end()) { + problem_bins.insert({problem_sizes.at(i), std::vector({i}) }); + } + else { + it->second.push_back(i); + } + } + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "64_ada_fp8_gemm_grouped\n\n" + << " This example profiles the performance of a 'grouped' GEMM kernel. This is similar to batched GEMM\n" + << " in that multiple, independent GEMMs are computed by one grid launch. It differs in that each\n" + << " 'group' may compute a unique problem size. Problem sizes and pointers to matrices are both stored\n" + << " in device Global Memory and loaded by the kernel.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --benchmark= Executes a benchmark problem size.\n" + << " --output_file= Path to a CSV file to output results. If it exists already, results are appended.\n" + << " --tag= String tag to prepend to the CSV file.\n" + << " --groups= Number of individual GEMM problems (default: --groups=15)\n" + << " --m= Sets the M dimension for all groups. Otherwise, it is selected randomly\n" + << " --n= Sets the N dimension for all groups. Otherwise, it is selected randomly\n" + << " --k= Sets the K dimension for all groups. 
Otherwise, it is selected randomly\n" + << " --alpha= Epilogue scalar alpha (real part)\n" + << " --beta= Epilogue scalar beta (real part)\n" + << " --scheduler-modes= List of scheduler modes to be profile for grouped GEMM scheduler (default: --scheduler_modes=kDeviceOnly)\n" + << " --iterations= Number of profiling iterations to perform.\n" + << " --reference-check= If true, performs reference check.\n" + << " --verbose= If true, prints problem sizes and batching structure.\n" + << " --profile-initialization= If true, profiles the device-level kernel's initialization.\n" + << " --sort-problems= If true, sorts problem sizes in descending order of GEMM-K dimension.\n"; + + out << "\n\nExamples:\n\n" + + << "# Runs a grouped GEMM with 100 random problem sizes\n" + << "$ ./examples/64_ada_fp8_gemm_grouped/64_ada_fp8_gemm_grouped --groups=100\n\n" + + << "# Runs a grouped GEMM with 100 random problem sizes (with GEMM-K dimension equal to 1024)\n" + << "$ ./examples/64_ada_fp8_gemm_grouped/64_ada_fp8_gemm_grouped --groups=100 --k=1024 --verbose=true\n\n" + + << "# Runs a grouped GEMM that is equivalent to a batched GEMM\n" + << "$ ./examples/64_ada_fp8_gemm_grouped/64_ada_fp8_gemm_grouped --groups=100 --m=2048 --n=1024 --k=1024 --verbose=true\n\n" + + << "# Runs a grouped GEMM with each different scheduler mode\n" + << "$ ./examples/64_ada_fp8_gemm_grouped/64_ada_fp8_gemm_grouped --scheduler-modes=all\n\n" + + << "# Runs a grouped GEMM with each different scheduler mode and profiles host-side initialization time\n" + << "$ ./examples/64_ada_fp8_gemm_grouped/64_ada_fp8_gemm_grouped --scheduler-modes=all --profile-initialization=true\n\n" + + << "# Runs a grouped GEMM problem given an externally supplied benchmark file. This is a text file in which\n" + << "# Each line contains a unique group index and an MxNxK triple indicating problemsize.\n" + << "#\n" + << "# For example, assume the following are the contents of 'problems.txt'\n" + << "#\n" + << "# 0 1024x256x520\n" + << "# 1 520x264x1024\n" + << "# 2 96x48x1024\n" + << "#\n" + << "$ ./examples/64_ada_fp8_gemm_grouped/64_ada_fp8_gemm_grouped --benchmark=problems.txt\n\n" + + << "# Execute Grouped GEMM and profile with NSight\n" + << "$ nv-nsight-cu-cli ./examples/64_ada_fp8_gemm_grouped/64_ada_fp8_gemm_grouped --m=256 --n=256 --k=256 --verbose=true --iterations=1 --reference-check=false\n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of real-valued multiply-adds + int64_t fmas = int64_t(); + + for (auto const & problem : problem_sizes) { + fmas += problem.product(); + } + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class BaseTestbed { +public: + // + // Type definitions + // + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementC = typename Gemm::ElementC; + using ElementAccumulator = typename Gemm::ElementAccumulator; + + using EpilogueOutputOp = typename Gemm::GemmKernel::Epilogue::OutputOp; + using ElementCompute = typename EpilogueOutputOp::ElementCompute; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + + using MatrixCoord = typename LayoutC::TensorCoord; + + using DeviceGemmReference = cutlass::reference::device::Gemm< + ElementA, + LayoutA, + ElementB, + LayoutB, + 
ElementC, + LayoutC, + ElementAccumulator, + ElementAccumulator>; + + // + // Data members + // + + Options & options; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint32_t seed; + + cutlass::DeviceAllocation problem_sizes_device; + + std::vector offset_A; + std::vector offset_B; + std::vector offset_C; + std::vector offset_D; + + std::vector lda_host; + std::vector ldb_host; + std::vector ldc_host; + std::vector ldd_host; + std::vector alpha_ptr_array_host; + + cutlass::DeviceAllocation lda; + cutlass::DeviceAllocation ldb; + cutlass::DeviceAllocation ldc; + cutlass::DeviceAllocation ldd; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation alpha_array_device; + + cutlass::DeviceAllocation ptr_A; + cutlass::DeviceAllocation ptr_B; + cutlass::DeviceAllocation ptr_C; + cutlass::DeviceAllocation ptr_D; + cutlass::DeviceAllocation alpha_ptr_array_device; + + BaseTestbed( + Options &options_, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint32_t seed_ = 3080 + ): + options(options_), init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + int problem_count() const { + return options.problem_count; + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + Element *ptr, + size_t capacity, + cutlass::Distribution::Kind dist_kind, + uint32_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = static_cast(2); + scope_min = static_cast(0); + } else if (bits_input <= 8) { + scope_max = static_cast(2); + scope_min = static_cast(-2); + } else if (bits_output == 16) { + if (cutlass::sizeof_bits::value <= 16) { + scope_max = static_cast(5); + scope_min = static_cast(-5); + } + else { + scope_max = static_cast(8); + scope_min = static_cast(-8); + } + } else { + scope_max = static_cast(8); + scope_min = static_cast(-8); + } + + cutlass::reference::device::BlockFillRandomUniform( + ptr, capacity, seed, scope_max, scope_min, 0); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::device::BlockFillRandomGaussian( + ptr, capacity, seed, Element(), Element(0.5f)); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + // Fill with increasing elements + cutlass::reference::device::BlockFillSequential( + ptr, capacity, Element(1), Element()); + } + else { + + // Fill with all 1s + cutlass::reference::device::BlockFillSequential( + ptr, capacity, Element(), Element(1)); + } + } + + /// Allocates device-side data + void allocate() { + int64_t total_elements_A = 0; + int64_t total_elements_B = 0; + int64_t total_elements_C = 0; + int64_t total_elements_D = 0; + + lda_host.resize(problem_count()); + ldb_host.resize(problem_count()); + ldc_host.resize(problem_count()); + ldd_host.resize(problem_count()); + + for (int32_t i = 0; i < problem_count(); ++i) { + + auto problem = options.problem_sizes.at(i); + + lda_host.at(i) = LayoutA::packed({problem.m(), problem.k()}).stride(0); + ldb_host.at(i) = LayoutB::packed({problem.k(), problem.n()}).stride(0); + 
ldc_host.at(i) = LayoutC::packed({problem.m(), problem.n()}).stride(0); + ldd_host.at(i) = LayoutC::packed({problem.m(), problem.n()}).stride(0); + + offset_A.push_back(total_elements_A); + offset_B.push_back(total_elements_B); + offset_C.push_back(total_elements_C); + offset_D.push_back(total_elements_D); + + int64_t elements_A = problem.m() * problem.k(); + int64_t elements_B = problem.k() * problem.n(); + int64_t elements_C = problem.m() * problem.n(); + int64_t elements_D = problem.m() * problem.n(); + + total_elements_A += elements_A; + total_elements_B += elements_B; + total_elements_C += elements_C; + total_elements_D += elements_D; + } + + lda.reset(problem_count()); + ldb.reset(problem_count()); + ldc.reset(problem_count()); + ldd.reset(problem_count()); + + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + + alpha_ptr_array_host.resize(problem_count()); + alpha_array_device.reset(problem_count()); + alpha_ptr_array_device.reset(problem_count()); + } + + /// Initializes device-side data + void initialize() { + problem_sizes_device.reset(problem_count()); + problem_sizes_device.copy_from_host(options.problem_sizes.data()); + + lda.copy_from_host(lda_host.data()); + ldb.copy_from_host(ldb_host.data()); + ldc.copy_from_host(ldc_host.data()); + ldd.copy_from_host(ldd_host.data()); + + // + // Assign pointers + // + + std::vector ptr_A_host(problem_count()); + std::vector ptr_B_host(problem_count()); + std::vector ptr_C_host(problem_count()); + std::vector ptr_D_host(problem_count()); + + for (int32_t i = 0; i < problem_count(); ++i) { + ptr_A_host.at(i) = block_A.get() + offset_A.at(i); + ptr_B_host.at(i) = block_B.get() + offset_B.at(i); + ptr_C_host.at(i) = block_C.get() + offset_C.at(i); + ptr_D_host.at(i) = block_D.get() + offset_D.at(i); + } + + ptr_A.reset(problem_count()); + ptr_A.copy_from_host(ptr_A_host.data()); + + ptr_B.reset(problem_count()); + ptr_B.copy_from_host(ptr_B_host.data()); + + ptr_C.reset(problem_count()); + ptr_C.copy_from_host(ptr_C_host.data()); + + ptr_D.reset(problem_count()); + ptr_D.copy_from_host(ptr_D_host.data()); + + // + // Initialize the problems of the workspace + // + + initialize_tensor(block_A.get(), block_A.size(), init_A, seed * 2021); + initialize_tensor(block_B.get(), block_B.size(), init_B, seed * 2022); + initialize_tensor(block_C.get(), block_C.size(), init_C, seed * 2023); + + cutlass::reference::device::BlockFillSequential( + block_D.get(), block_D.size(), ElementC(), ElementC()); + + // Initialize alpha array + alpha_array_device.copy_from_host(options.alpha_array.data()); + for (int32_t i = 0; i < problem_count(); ++i) { + alpha_ptr_array_host.at(i) = alpha_array_device.get() + i; + } + alpha_ptr_array_device.copy_from_host(alpha_ptr_array_host.data()); + } + + /// Verifies the result is a GEMM + bool verify() { + + bool passed = true; + + for (int32_t i = 0; i < problem_count(); ++i) { + cutlass::gemm::GemmCoord problem = options.problem_sizes.at(i); + + LayoutA layout_A(lda_host.at(i)); + LayoutB layout_B(ldb_host.at(i)); + LayoutC layout_C(ldc_host.at(i)); + LayoutC layout_D(ldd_host.at(i)); + + MatrixCoord extent_A{problem.m(), problem.k()}; + MatrixCoord extent_B{problem.k(), problem.n()}; + MatrixCoord extent_C{problem.m(), problem.n()}; + + cutlass::TensorView view_A(block_A.get() + offset_A.at(i), layout_A, extent_A); + cutlass::TensorView view_B(block_B.get() + offset_B.at(i), layout_B, extent_B); + cutlass::TensorView 
view_C(block_C.get() + offset_C.at(i), layout_C, extent_C); + + cutlass::DeviceAllocation block_Ref(layout_D.capacity(extent_C)); + cutlass::TensorView view_Ref_device(block_Ref.get(), layout_D, extent_C); + + // Reference GEMM + cutlass::reference::device::GemmComplex< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, ElementAccumulator + >( + problem, + options.alpha_array[i], + view_A, + Gemm::kTransformA, + view_B, + Gemm::kTransformB, + options.beta, + view_C, + view_Ref_device, + ElementAccumulator(0) + ); + + // Copy to host memory + std::vector matrix_D(layout_D.capacity(extent_C)); + std::vector matrix_Ref(layout_D.capacity(extent_C)); + + cutlass::device_memory::copy_to_host(matrix_D.data(), block_D.get() + offset_D.at(i), matrix_D.size()); + cutlass::device_memory::copy_to_host(matrix_Ref.data(), block_Ref.get(), matrix_D.size()); + + cutlass::TensorView view_D( matrix_D.data(), layout_D, extent_C); + cutlass::TensorView view_Ref(matrix_Ref.data(), layout_D, extent_C); + + // Reference check + passed = cutlass::reference::host::TensorEquals(view_D, view_Ref); + + if (!passed) { + std::cerr << "\n***\nError - problem " << i << " failed the QA check\n***\n" << std::endl; + return passed; + } + } + + return passed; + } + +}; + +template +class TestbedGrouped : BaseTestbed { +public: + TestbedGrouped( + Options &options_, + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint32_t seed_ = 3080 + ): BaseTestbed(options_, init_A_, init_B_, init_C_, seed_) {} + + // Redefine GEMM with different GroupScheduleMode_ + using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGroupedPerGroupScale< + typename Gemm_::ElementA, + typename Gemm_::LayoutA, + Gemm_::kTransformA, + Gemm_::kAlignmentA, + typename Gemm_::ElementB, + typename Gemm_::LayoutB, + Gemm_::kTransformB, + Gemm_::kAlignmentB, + typename Gemm_::ElementC, + typename Gemm_::LayoutC, + typename Gemm_::ElementAccumulator, + typename Gemm_::OperatorClass, + typename Gemm_::ArchTag, + typename Gemm_::ThreadblockShape, + typename Gemm_::WarpShape, + typename Gemm_::InstructionShape, + typename Gemm_::EpilogueOutputOp, + typename Gemm_::ThreadblockSwizzle, + Gemm_::kStages, + GroupScheduleMode_>::GemmKernel; + + using Gemm = cutlass::gemm::device::GemmGrouped; + + /// Verbose printing of problem sizes + void print_problem_sizes() { + std::cout << std::endl; + + // Print groups + std::cout << this->problem_count() << " groups:\n"; + + int32_t idx = 0; + int64_t total_tiles = 0; + + for (auto const & problem : this->options.problem_sizes) { + int tiles = Gemm::problem_tile_count(problem); + total_tiles += tiles; + + std::cout << " [" << idx << "]: " + << problem.m() << "-by-" << problem.n() << "-by-" << problem.k() + << " (" << tiles << " threadblock tiles)" << "\n"; + + ++idx; + } + std::cout << std::endl; + } + + /// Sort problems in descending order of problem-K dimension + void sort_problems() { + Gemm::sort_problems(this->options.problem_count, + this->options.problem_sizes.data(), + this->lda_host.data(), + this->ldb_host.data(), + this->ldc_host.data(), + this->ldd_host.data(), + this->offset_A.data(), + this->offset_B.data(), + this->offset_C.data(), + this->offset_D.data()); + } + + /// Executes a grouped kernel and measures runtime + Result profile() { + std::string sched_mode = 
this->options.scheduler_mode_to_str.find(GroupScheduleMode_)->second; + + std::cout << std::endl; + std::cout << "Grouped GEMM (CUTLASS) with mode " << sched_mode << ":\n" + << "====================================================" << std::endl; + + Result result; + + int threadblock_count = Gemm::sufficient(this->options.problem_sizes.data(), this->options.problem_count); + + // Early exit + if (!threadblock_count) { + std::cout << "Active CUDA device lacks hardware resources to run CUTLASS Grouped GEMM kernel." << std::endl; + return result; + } + + result.passed = false; + + // Initialize the problem + this->allocate(); + if (this->options.sort_problems) { + sort_problems(); + } + this->initialize(); + + if (this->options.verbose) { + print_problem_sizes(); + } + + // Configure the GEMM arguments + typename Gemm::EpilogueOutputOp::ElementCompute ** alpha_ptr_array = this->alpha_ptr_array_device.get(); + typename Gemm::EpilogueOutputOp::Params epilogue_op(alpha_ptr_array, nullptr); + + // Configure GEMM arguments + typename Gemm::Arguments args( + this->problem_sizes_device.get(), + this->problem_count(), + threadblock_count, + epilogue_op, + this->ptr_A.get(), + this->ptr_B.get(), + this->ptr_C.get(), + this->ptr_D.get(), + this->lda.get(), + this->ldb.get(), + this->ldc.get(), + this->ldd.get(), + this->options.problem_sizes.data() + ); + + // Initialize the GEMM object + Gemm gemm; + + size_t workspace_size = gemm.get_workspace_size(args); + cutlass::DeviceAllocation workspace(workspace_size); + + result.status = gemm.initialize(args, workspace.get()); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to initialize CUTLASS Grouped GEMM kernel." << std::endl; + return result; + } + + // Run the grouped GEMM object + result.status = gemm.run(); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to run CUTLASS Grouped GEMM kernel." << std::endl; + return result; + } + + // Wait for completion + result.error = cudaDeviceSynchronize(); + + if (result.error != cudaSuccess) { + std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error); + return result; + } + + // + // Verify correctness + // + result.passed = true; + + if (this->options.reference_check) { + result.passed = this->verify(); + } + + // + // Warm-up run of the grouped GEMM object + // + result.status = gemm.run(); + + if (result.status != cutlass::Status::kSuccess) { + std::cerr << "Failed to run CUTLASS Grouped GEMM kernel." << std::endl; + return result; + } + + // + // Construct events + // + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return -1; + } + } + + // Record an event at the start of a series of GEMM operations + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // + // Run profiling loop + // + + for (int iter = 0; iter < this->options.iterations; ++iter) { + gemm(); + } + + // + // Stop profiling loop + // + + // Record an event when the GEMM operations have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. 
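+    // (The two events bracket the launch loop above; the elapsed time is later divided by the
+    // iteration count to report an average per-launch runtime in milliseconds.)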
+ result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Compute average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(this->options.iterations); + result.gflops = this->options.gflops(result.runtime_ms / 1000.0); + + // + // Cleanup + // + + for (auto event : events) { + (void)cudaEventDestroy(event); + } + + // Optionally profile initialization + if (this->options.profile_initialization) { + // Warm up + gemm.initialize(args, workspace.get()); + + auto start_time = std::chrono::high_resolution_clock::now(); + for (int32_t i = 0; i < this->options.iterations; ++i) { + gemm.initialize(args, workspace.get()); + } + auto end_time = std::chrono::high_resolution_clock::now(); + + std::chrono::duration duration = end_time - start_time; + duration /= double(this->options.iterations); + result.initialization_time_ms = duration.count(); + } + + int64_t total_tiles = Gemm::group_tile_count(args); + std::cout << " " << total_tiles << " total threadblock tiles." << std::endl; + + std::cout << std::endl; + std::cout << " " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl; + std::cout << " " << "Grouped GFLOPs: " << result.gflops << std::endl; + if (this->options.profile_initialization) { + std::cout << " " << "Init Runtime: " << result.initialization_time_ms << " ms" << std::endl; + } + + if (this->options.output_file.good()) { + this->options.output_file << this->options.output_tag << ",CUTLASS,grouped-" << sched_mode << "," + << this->options.problem_count << "," << result.runtime_ms << "," << result.gflops << std::endl; + } + + std::cout << "\nPassed\n"; + + return result; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 4)) { + std::cerr << "This example requires CUDA 12.4 or greater." << std::endl; + return 0; + } + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + cudaDeviceProp properties; + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() failed with error: " << cudaGetErrorString(result) << std::endl; + return 0; + } + + if (!(properties.major == 8 && properties.minor == 9)) { + std::cerr << "CUTLASS's Ada FP8 Gemm Grouped example requires a device of compute capability 89.\n" << std::endl; + return 0; + } + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." 
<< std::endl; + return -1; + } + + // + // Define the Grouped and Batched GEMM types + // + + using ElementA = cutlass::float_e4m3_t; + using ElementB = cutlass::float_e4m3_t; + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::RowMajor; + + constexpr int ElementsPerAccessA = 128 / cutlass::sizeof_bits::value; + constexpr int ElementsPerAccessB = 128 / cutlass::sizeof_bits::value; + + // Define a grouped GEMM kernel with all template parameters set except + // for scheduling mode. This will be used as the template for all scheduling + // modes executed. + using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGroupedPerGroupScale< + ElementA, + LayoutA, + cutlass::ComplexTransform::kNone, + ElementsPerAccessA, + ElementB, + LayoutB, + cutlass::ComplexTransform::kNone, + ElementsPerAccessB, + ElementOutput, LayoutC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm89, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + // NOTE: Threadblock swizzling is currently not supported by CUTLASS's grouped kernels. + // This parameter is passed in at present to match the APIs of other kernels. The parameter + // is unused within the kernel. + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 4>::GemmKernel; + + using GemmGrouped = cutlass::gemm::device::GemmGrouped; + + // + // Profile it + // + + using GroupScheduleMode = cutlass::gemm::kernel::GroupScheduleMode; + for (GroupScheduleMode mode : options.scheduler_modes) { + Result result; + switch (mode) { + case GroupScheduleMode::kDeviceOnly: + { + TestbedGrouped runner(options); + result = runner.profile(); + break; + } + case GroupScheduleMode::kHostPrecompute: + { + TestbedGrouped runner(options); + result = runner.profile(); + break; + } + } + + if (result.error != cudaSuccess) { + return 1; + } + + // Override verbose flag to avoid printing duplicate information for each scheduling mode + options.verbose = false; + } + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d5fdac141b..7e8d45227b 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,5 +1,5 @@ -# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
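// The main() above fixes each global-memory access at 128 bits, so the elements-per-access
// constants follow directly from the element widths. A small worked sketch of that
// arithmetic for the types used in this example (cutlass::float_e4m3_t operands,
// cutlass::bfloat16_t output); the constant names here are illustrative only:

#include "cutlass/numeric_types.h"

constexpr int kAccessBits = 128;
constexpr int kElementsPerAccessA = kAccessBits / cutlass::sizeof_bits<cutlass::float_e4m3_t>::value; // 128 / 8  = 16
constexpr int kElementsPerAccessB = kAccessBits / cutlass::sizeof_bits<cutlass::float_e4m3_t>::value; // 128 / 8  = 16
constexpr int kElementsPerAccessC = kAccessBits / cutlass::sizeof_bits<cutlass::bfloat16_t>::value;   // 128 / 16 = 8

static_assert(kElementsPerAccessA == 16 && kElementsPerAccessC == 8, "128-bit vectorized accesses");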
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -44,7 +44,7 @@ function(cutlass_example_add_executable NAME) set(__DISABLE_TESTS OFF) endif() - cutlass_add_executable(${NAME} ${__UNPARSED_ARGUMENTS}) + cutlass_add_executable(${NAME} ${__UNPARSED_ARGUMENTS} BATCH_SOURCES OFF) add_dependencies(cutlass_examples ${NAME}) @@ -54,12 +54,14 @@ function(cutlass_example_add_executable NAME) CUTLASS cutlass_tools_util_includes $<$:nvidia::cublas> + cuda ) target_include_directories( ${NAME} PRIVATE ${CUTLASS_EXAMPLES_COMMON_SOURCE_DIR} + ${CUTLASS_EXAMPLES_UTILS_DIR} ) install( @@ -116,8 +118,35 @@ foreach(EXAMPLE 34_transposed_conv2d 35_gemm_softmax 36_gather_scatter_fusion + 37_gemm_layernorm_gemm_fusion + 38_syr2k_grouped + cute + 39_gemm_permute + 41_fused_multi_head_attention + 42_ampere_tensorop_group_conv + 43_ell_block_sparse_gemm + 45_dual_gemm + 46_depthwise_simt_conv2dfprop + 47_ampere_gemm_universal_streamk + 48_hopper_warp_specialized_gemm + 49_hopper_gemm_with_collective_builder + 50_hopper_gemm_with_epilogue_swizzle + 51_hopper_gett + 52_hopper_gather_scatter_fusion + 53_hopper_gemm_permute + 54_hopper_fp8_warp_specialized_gemm + 55_hopper_mixed_dtype_gemm + 56_hopper_ptr_array_batched_gemm + 57_hopper_grouped_gemm + 58_ada_fp8_gemm + 59_ampere_gather_scatter_conv + 61_hopper_gemm_with_topk_and_softmax + 62_hopper_sparse_gemm + 63_hopper_gemm_with_weight_prefetch + 64_ada_fp8_gemm_grouped ) add_subdirectory(${EXAMPLE}) endforeach() + diff --git a/examples/common/gather_tensor.hpp b/examples/common/gather_tensor.hpp new file mode 100644 index 0000000000..62616e00c7 --- /dev/null +++ b/examples/common/gather_tensor.hpp @@ -0,0 +1,215 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cute/layout.hpp" +#include "cute/tensor.hpp" +#include "cute/util/print.hpp" + +namespace example { + +using namespace cute; + +// Empty type used to disable gather/scatter for a GEMM argument +struct NoGather +{ + template + NoGather(Ts...) {}; +}; + +/// Function object that applies an index to its argument +template +struct IndexedGather +{ + CUTE_HOST_DEVICE constexpr + IndexedGather(Index const *indices = {}): indices_(indices) {} + + template + CUTE_HOST_DEVICE constexpr + Index + operator()(I i) const { return indices_[i]; } + + CUTE_HOST_DEVICE friend + void + print(IndexedGather const &s) { + cute::print("Indexed"); + } + + Index const *indices_; +}; + +/// Function object that applies a stride to its argument +/// Example: StridedFunc gathers every other row/column +template +struct StridedGather +{ + CUTE_HOST_DEVICE constexpr + StridedGather(Stride stride = {}): stride_(stride) {} + + template + CUTE_HOST_DEVICE constexpr + auto + operator()(I i) const { return i * stride_; } + + CUTE_HOST_DEVICE friend + void + print(StridedGather const &s) { + cute::print("Strided{"); + print(s.stride_); + cute::print("}"); + } + + Stride stride_; +}; + +/// Custom stride object that applies a function followed by a stride +template +struct CustomStride +{ + CUTE_HOST_DEVICE constexpr + CustomStride(Func const &func, Stride const &stride): func_(func), stride_(stride) {} + + template + CUTE_HOST_DEVICE constexpr friend + auto + operator*(I i, CustomStride const &s) { return s.func_(i) * s.stride_; } + + template + CUTE_HOST_DEVICE constexpr friend + auto + operator*(CustomStride const &s, I i) { return s.func_(i) * s.stride_; } + + CUTE_HOST_DEVICE friend + void + print(CustomStride const & s) { + cute::print("Custom{"); + print(s.func_); + cute::print(","); + print(s.stride_); + cute::print("}"); + } + + template + CUTE_HOST_DEVICE constexpr friend + auto + safe_div(CustomStride const &s, Div const &div) + { + return CustomStride(s.func_, safe_div(s.stride_, div)); + } + + // Circumvent the requirement on make_layout that shape and stride are integral + template + CUTE_HOST_DEVICE constexpr friend + auto + make_layout(Shape const &shape, CustomStride const &stride) + { + return Layout(shape, stride); + } + + Func func_; + Stride stride_; +}; + +template +CUTLASS_HOST_DEVICE +auto +make_custom_stride_layout(Stride const &stride, Func&& func) +{ + // Use a dummy shape and replace the first non-unit stride with a custom gather stride + auto idx = find_if(stride, [](auto x){ return not is_constant<1, decltype(x)>{}; }); + constexpr int I = decltype(idx)::value; + return make_layout(repeat_like(stride, _1{}), + replace(stride, CustomStride{static_cast(func), get(stride)})); +} + +/// Helper function to optionally create a gather tensor +template +CUTLASS_HOST_DEVICE +auto +make_gather_tensor(Iterator iter, Shape const &shape, Stride const &stride, Func &&func) +{ + if constexpr (not cutlass::platform::is_same, NoGather>::value) { + Layout matrix_layout = make_identity_layout(shape); + auto offset = as_arithmetic_tuple(repeat_like(shape, _0{})); + Layout gather_layout = make_custom_stride_layout(stride, static_cast(func)); + return make_tensor(iter, ComposedLayout{gather_layout, offset, matrix_layout}); + } else { + return make_tensor(iter, shape, stride); + } +} + +} // namespace example + +namespace cute +{ + +template +CUTE_HOST_DEVICE constexpr +auto 
+upcast(Shape const& shape, Stride const& stride) +{ + if constexpr (is_tuple::value) { + return transform_layout(shape, stride, [](auto const& s, auto const& d) { return upcast(s,d); }); + } else if constexpr (is_scaled_basis::value) { + if constexpr (Stride::mode() == I) { + return make_layout(shape_div(shape, Int{}), shape_div(stride, Int{})); + } else { + return make_layout(shape, stride); + } + } else { + return upcast(shape, stride); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(ComposedLayout,Offset,Layout> const& layout) +{ + // Find index of the stride-1 mode - that is the only one that requires updating inner shape and offset + auto idx = find_if(layout.layout_a().stride(), [](auto x){ return is_constant<1, decltype(x)>{}; }); + constexpr int I = decltype(idx)::value; + + // Upcast the outer layout (works as expected) + auto outer = upcast(layout.layout_a()); + + // Upcast the accumulated offset along stride-1 mode + auto offset = as_arithmetic_tuple(replace(layout.offset(), upcast(get(layout.offset())))); + + // Upcast the inner layout's shape along stride-1 mode + auto inner = upcast(layout.layout_b().shape(), layout.layout_b().stride()); + + return composition(outer, offset, inner); +} + +} // namespace example diff --git a/examples/common/helper.h b/examples/common/helper.h index 2affd96c68..a7a81e7479 100644 --- a/examples/common/helper.h +++ b/examples/common/helper.h @@ -1,7 +1,41 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ #pragma once #include "cuda_runtime.h" +#include +/** + * Panic wrapper for unwinding CUTLASS errors + */ #define CUTLASS_CHECK(status) \ { \ cutlass::Status error = status; \ @@ -12,6 +46,10 @@ } \ } + +/** + * Panic wrapper for unwinding CUDA runtime errors + */ #define CUDA_CHECK(status) \ { \ cudaError_t error = status; \ @@ -21,3 +59,50 @@ exit(EXIT_FAILURE); \ } \ } + + +/** + * GPU timer for recording the elapsed time across kernel(s) launched in GPU stream + */ +struct GpuTimer +{ + cudaStream_t _stream_id; + cudaEvent_t _start; + cudaEvent_t _stop; + + /// Constructor + GpuTimer() : _stream_id(0) + { + CUDA_CHECK(cudaEventCreate(&_start)); + CUDA_CHECK(cudaEventCreate(&_stop)); + } + + /// Destructor + ~GpuTimer() + { + CUDA_CHECK(cudaEventDestroy(_start)); + CUDA_CHECK(cudaEventDestroy(_stop)); + } + + /// Start the timer for a given stream (defaults to the default stream) + void start(cudaStream_t stream_id = 0) + { + _stream_id = stream_id; + CUDA_CHECK(cudaEventRecord(_start, _stream_id)); + } + + /// Stop the timer + void stop() + { + CUDA_CHECK(cudaEventRecord(_stop, _stream_id)); + } + + /// Return the elapsed time (in milliseconds) + float elapsed_millis() + { + float elapsed = 0.0; + CUDA_CHECK(cudaEventSynchronize(_stop)); + CUDA_CHECK(cudaEventElapsedTime(&elapsed, _start, _stop)); + return elapsed; + } +}; diff --git a/examples/cute/CMakeLists.txt b/examples/cute/CMakeLists.txt new file mode 100644 index 0000000000..69aefd7c94 --- /dev/null +++ b/examples/cute/CMakeLists.txt @@ -0,0 +1,30 @@ + +# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
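// The GpuTimer added to examples/common/helper.h above wraps the same CUDA-event pattern
// behind start()/stop()/elapsed_millis(). A short usage sketch -- "my_kernel", its launch
// configuration, and "time_my_kernel" are hypothetical placeholders:

#include "helper.h"

__global__ void my_kernel() { /* ... device work ... */ }

float time_my_kernel(int iterations) {
  GpuTimer timer;
  timer.start();                                        // records the start event (default stream)
  for (int i = 0; i < iterations; ++i) {
    my_kernel<<<64, 256>>>();
  }
  timer.stop();                                         // records the stop event
  return timer.elapsed_millis() / float(iterations);    // synchronizes on the stop event, then averages
}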
+ +add_subdirectory(tutorial) diff --git a/examples/cute/tutorial/CMakeLists.txt b/examples/cute/tutorial/CMakeLists.txt new file mode 100644 index 0000000000..b427d9368c --- /dev/null +++ b/examples/cute/tutorial/CMakeLists.txt @@ -0,0 +1,60 @@ + +# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_example_add_executable( + sgemm_1 + sgemm_1.cu +) + +cutlass_example_add_executable( + sgemm_2 + sgemm_2.cu +) + +cutlass_example_add_executable( + sgemm_sm70 + sgemm_sm70.cu +) + +cutlass_example_add_executable( + sgemm_sm80 + sgemm_sm80.cu +) + +cutlass_example_add_executable( + tiled_copy + tiled_copy.cu +) + +cutlass_example_add_executable( + wgmma_sm90 + wgmma_sm90.cu +) + diff --git a/examples/cute/tutorial/sgemm_1.cu b/examples/cute/tutorial/sgemm_1.cu new file mode 100644 index 0000000000..e5bf9a9201 --- /dev/null +++ b/examples/cute/tutorial/sgemm_1.cu @@ -0,0 +1,469 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#include +#include +#include + +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "cutlass/util/GPU_Clock.hpp" +#include "cutlass/util/helper_cuda.hpp" + +template +__global__ static +__launch_bounds__(decltype(size(CThreadLayout{}))::value) +void +gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, + TA const* A, AStride dA, ASmemLayout sA_layout, AThreadLayout tA, + TB const* B, BStride dB, BSmemLayout sB_layout, BThreadLayout tB, + TC * C, CStride dC, CSmemLayout , CThreadLayout tC, + Alpha alpha, Beta beta) +{ + using namespace cute; + + // Preconditions + CUTE_STATIC_ASSERT_V(rank(shape_MNK) == Int<3>{}); // (M, N, K) + CUTE_STATIC_ASSERT_V(rank(cta_tiler) == Int<3>{}); // (BLK_M, BLK_N, BLK_K) + + static_assert(is_static::value); + static_assert(is_static::value); + static_assert(is_static::value); + + CUTE_STATIC_ASSERT_V(size(tA) == size(tB)); // NumThreads + CUTE_STATIC_ASSERT_V(size(tC) == size(tA)); // NumThreads + + CUTE_STATIC_ASSERT_V(size<0>(cta_tiler) % size<0>(tA) == Int<0>{}); // BLK_M / THR_M + CUTE_STATIC_ASSERT_V(size<2>(cta_tiler) % size<1>(tA) == Int<0>{}); // BLK_K / THR_K + CUTE_STATIC_ASSERT_V(size<1>(cta_tiler) % size<0>(tB) == Int<0>{}); // BLK_N / THR_N + CUTE_STATIC_ASSERT_V(size<2>(cta_tiler) % size<1>(tB) == Int<0>{}); // BLK_K / THR_K + CUTE_STATIC_ASSERT_V(size<0>(cta_tiler) % size<0>(tC) == Int<0>{}); // BLK_M / THR_M + CUTE_STATIC_ASSERT_V(size<1>(cta_tiler) % size<1>(tC) == Int<0>{}); // BLK_N / THR_N + + static_assert(is_static::value); + static_assert(is_static::value); + static_assert(is_static::value); + + CUTE_STATIC_ASSERT_V(size<0>(ASmemLayout{}) == size<0>(cta_tiler)); // BLK_M + CUTE_STATIC_ASSERT_V(size<0>(CSmemLayout{}) == size<0>(cta_tiler)); // BLK_M + CUTE_STATIC_ASSERT_V(size<0>(BSmemLayout{}) == size<1>(cta_tiler)); // BLK_N + CUTE_STATIC_ASSERT_V(size<1>(CSmemLayout{}) == size<1>(cta_tiler)); // BLK_N + CUTE_STATIC_ASSERT_V(size<1>(ASmemLayout{}) == size<2>(cta_tiler)); // BLK_K + CUTE_STATIC_ASSERT_V(size<1>(BSmemLayout{}) == size<2>(cta_tiler)); // BLK_K + + CUTE_STATIC_ASSERT_V(congruent(select<0,2>(shape_MNK), dA)); // dA strides for shape MK + CUTE_STATIC_ASSERT_V(congruent(select<1,2>(shape_MNK), dB)); // dB strides for shape NK + CUTE_STATIC_ASSERT_V(congruent(select<0,1>(shape_MNK), dC)); // dC strides for shape MN + + // + // Full and Tiled Tensors + // + + // Represent the full tensors + Tensor mA = make_tensor(make_gmem_ptr(A), select<0,2>(shape_MNK), dA); // (M,K) + Tensor mB = make_tensor(make_gmem_ptr(B), select<1,2>(shape_MNK), dB); // (N,K) + Tensor mC = make_tensor(make_gmem_ptr(C), 
select<0,1>(shape_MNK), dC); // (M,N) + + // Get the appropriate blocks for this thread block + auto cta_coord = make_coord(blockIdx.x, blockIdx.y, _); // (m,n,k) + Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) + Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) + Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) + + // Shared memory buffers + __shared__ TA smemA[cosize_v]; + __shared__ TB smemB[cosize_v]; + Tensor sA = make_tensor(make_smem_ptr(smemA), sA_layout); // (BLK_M,BLK_K) + Tensor sB = make_tensor(make_smem_ptr(smemB), sB_layout); // (BLK_N,BLK_K) + + // + // Partition the copying of A and B tiles across the threads + // + + // TUTORIAL: Example of simple raked partitioning of ThreadLayouts tA|tB over data A|B tiles + + Tensor tAgA = local_partition(gA, tA, threadIdx.x); // (THR_M,THR_K,k) + Tensor tAsA = local_partition(sA, tA, threadIdx.x); // (THR_M,THR_K) + + Tensor tBgB = local_partition(gB, tB, threadIdx.x); // (THR_N,THR_K,k) + Tensor tBsB = local_partition(sB, tB, threadIdx.x); // (THR_N,THR_K) + + CUTE_STATIC_ASSERT_V(size<0>(tAgA) == size<0>(tAsA)); // THR_M + CUTE_STATIC_ASSERT_V(size<1>(tAgA) == size<1>(tAsA)); // THR_K + CUTE_STATIC_ASSERT_V(size<0>(tBgB) == size<0>(tBsB)); // THR_N + CUTE_STATIC_ASSERT_V(size<1>(tBgB) == size<1>(tBsB)); // THR_K + + // + // Define A/B partitioning and C accumulators + // + + // TUTORIAL: Example of partitioning via projections of a ThreadLayout tC + + // Partition sA (M,K) by the rows of tC + Tensor tCsA = local_partition(sA, tC, threadIdx.x, Step<_1, X>{}); // (THR_M,BLK_K) + // Partition sB (N,K) by the cols of tC + Tensor tCsB = local_partition(sB, tC, threadIdx.x, Step< X,_1>{}); // (THR_N,BLK_K) + // Partition gC (M,N) by the tile of tC + Tensor tCgC = local_partition(gC, tC, threadIdx.x, Step<_1,_1>{}); // (THR_M,THR_N) + + // Allocate the accumulators -- same shape/layout as the partitioned data + Tensor tCrC = make_tensor_like(tCgC); // (THR_M,THR_N) + + CUTE_STATIC_ASSERT_V(size<0>(tCrC) == size<0>(tCgC)); // THR_M + CUTE_STATIC_ASSERT_V(size<0>(tCrC) == size<0>(tCsA)); // THR_M + CUTE_STATIC_ASSERT_V(size<1>(tCrC) == size<1>(tCgC)); // THR_N + CUTE_STATIC_ASSERT_V(size<1>(tCrC) == size<0>(tCsB)); // THR_N + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCsB)); // BLK_K + + // Clear the accumulators + clear(tCrC); + +#if 0 + if(thread0()) { + print(" mA : "); print( mA); print("\n"); + print(" gA : "); print( gA); print("\n"); + print(" sA : "); print( sA); print("\n"); + print("tAgA : "); print(tAgA); print("\n"); + print("tAsA : "); print(tAsA); print("\n"); + } +#endif + +#if 0 + if(thread0()) { + print(" mB : "); print( mB); print("\n"); + print(" gB : "); print( gB); print("\n"); + print(" sB : "); print( sB); print("\n"); + print("tBgB : "); print(tBgB); print("\n"); + print("tBsB : "); print(tBsB); print("\n"); + } +#endif + +#if 0 + if(thread0()) { + print(" mC : "); print( mC); print("\n"); + print(" gC : "); print( gC); print("\n"); + print("tCsA : "); print(tCsA); print("\n"); + print("tCsB : "); print(tCsB); print("\n"); + print("tCgC : "); print(tCgC); print("\n"); + print("tCrC : "); print(tCrC); print("\n"); + } +#endif + +#if 1 + + // TUTORIAL: Example of a simple mainloop that read tiles of data into shared memory, + // and then computes on those tiles. + // copy(.) operates on the global and shared memory via the tA|tB partitioning + // gemm(.) 
operates on the shared and register memory via the tC partitioning + + auto K_TILE_MAX = size<2>(tAgA); + + for (int k_tile = 0; k_tile < K_TILE_MAX; ++k_tile) + { + // Copy gmem to smem with tA|tB thread-partitioned tensors + copy(tAgA(_,_,k_tile), tAsA); // A (THR_M,THR_K) -> (THR_M,THR_K) + copy(tBgB(_,_,k_tile), tBsB); // B (THR_N,THR_K) -> (THR_N,THR_K) + + // TUTORIAL: The above call to copy(tAgA(_,_,k_tile), tAsA) is equivalent to + // Tensor tAgAk = tAgA(_,_,k_tile); + // CUTE_UNROLL + // for (int i = 0; i < size(tAsA); ++i) { + // tAsA(i) = tAgAk(i); + // } + + cp_async_fence(); // Label the end of (potential) cp.async instructions + cp_async_wait<0>(); // Sync on all (potential) cp.async instructions + __syncthreads(); // Wait for all threads to write to smem + + // Compute gemm on tC thread-partitioned smem + gemm(tCsA, tCsB, tCrC); // (THR_M,THR_N) += (THR_M,BLK_K) * (THR_N,BLK_K) + + // TUTORIAL: The above call to gemm(tCsA, tCsB, tCrC) is equivalent to + // CUTE_UNROLL + // for (int k = 0; k < size<1>(tCsA); ++k) { + // CUTE_UNROLL + // for (int m = 0; m < size<0>(tCrC); ++m) { + // CUTE_UNROLL + // for (int n = 0; n < size<1>(tCrC); ++n) { + // tCrC(m,n) += tCsA(m,k) * tCsB(n,k); + // } + // } + // } + + __syncthreads(); // Wait for all threads to read from smem + } + +#endif + + // + // Epilogue + // + + axpby(alpha, tCrC, beta, tCgC); + + // TUTORIAL: The above call to axpby(alpha, tCrC, beta, tCgC) is equivalent to + // CUTE_UNROLL + // for (int i = 0; i < size(tCsA); ++i) { + // tCgC(i) = alpha * tCrC(i) + beta * tCgC(i); + // } +} + +// Setup params for an NT GEMM +// Use m-major smem sA, n-major smem sB, and mn-major threads tA|tB +template +void +gemm_nt(int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + using namespace cute; + + // Define shapes (dynamic) + auto M = int(m); + auto N = int(n); + auto K = int(k); + auto prob_shape = make_shape(M, N, K); // (M, N, K) + + // Define NT strides (mixed) + auto dA = make_stride(Int<1>{}, ldA); // (dM, dK) + auto dB = make_stride(Int<1>{}, ldB); // (dN, dK) + auto dC = make_stride(Int<1>{}, ldC); // (dM, dN) + + // Define CTA tile sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 8>{}; + auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) + + // Define the smem layouts (static) + auto sA = make_layout(make_shape(bM, bK)); // (m,k) -> smem_idx; m-major + auto sB = make_layout(make_shape(bN, bK)); // (n,k) -> smem_idx; n-major + auto sC = make_layout(make_shape(bM, bN)); // (m,n) -> smem_idx; m-major + + // Define the thread layouts (static) + auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{})); // (m,k) -> thr_idx + auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{})); // (n,k) -> thr_idx + auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); // (m,n) -> thr_idx + + dim3 dimBlock(size(tC)); + dim3 dimGrid(size(ceil_div(M, bM)), + size(ceil_div(N, bN))); + gemm_device<<>> + (prob_shape, cta_tiler, + A, dA, sA, tA, + B, dB, sB, tB, + C, dC, sC, tC, + alpha, beta); +} + +// Setup params for a TN GEMM +// Use padded m-major smem sA, padded n-major smem sB, and k-major threads tA|tB +template +void +gemm_tn(int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + using namespace cute; + + // Define shapes (dynamic) + auto M = int(m); + auto N = int(n); + auto K = int(k); + auto prob_shape = 
make_shape(M, N, K); // (M, N, K) + + // Define TN strides (mixed) + auto dA = make_stride(ldA, Int<1>{}); // (dM, dK) + auto dB = make_stride(ldB, Int<1>{}); // (dN, dK) + auto dC = make_stride(Int<1>{}, ldC); // (dM, dN) + + // Define CTA tile sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 8>{}; + auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) + + // Define the smem layouts (static) + auto sA = make_layout(make_shape(bM,bK), LayoutRight{}); // (m,k) -> smem_idx; k-major + auto sB = make_layout(make_shape(bN,bK), LayoutRight{}); // (n,k) -> smem_idx; k-major + auto sC = make_layout(make_shape(bM, bN)); // (m,n) -> smem_idx; m-major + + // Define the thread layouts (static) + auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{}), LayoutRight{}); // (m,k) -> thr_idx; k-major + auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{}), LayoutRight{}); // (n,k) -> thr_idx; k-major + auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); // (m,n) -> thr_idx; m-major + + dim3 dimBlock(size(tC)); + dim3 dimGrid(size(ceil_div(M, bM)), + size(ceil_div(N, bN))); + gemm_device<<>> + (prob_shape, cta_tiler, + A, dA, sA, tA, + B, dB, sB, tB, + C, dC, sC, tC, + alpha, beta); +} + +template +void +gemm(char transA, char transB, int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + if (transA == 'N' && transB == 'T') { + return gemm_nt(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); + } else + if (transA == 'T' && transB == 'N') { + return gemm_tn(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); + } + assert(false && "Not implemented"); +} + + +int main(int argc, char** argv) +{ + int m = 5120; + if (argc >= 2) + sscanf(argv[1], "%d", &m); + + int n = 5120; + if (argc >= 3) + sscanf(argv[2], "%d", &n); + + int k = 4096; + if (argc >= 4) + sscanf(argv[3], "%d", &k); + + char transA = 'N'; + if (argc >= 5) + sscanf(argv[4], "%c", &transA); + + char transB = 'T'; + if (argc >= 6) + sscanf(argv[5], "%c", &transB); + + using TA = float; + using TB = float; + using TC = float; + using TI = float; + + TI alpha = 1.0; + TI beta = 0.0; + + std::cout << "M = " << m << std::endl; + std::cout << "N = " << n << std::endl; + std::cout << "K = " << k << std::endl; + std::cout << "C = A^" << transA << " B^" << transB << std::endl; + + cute::device_init(0); + + thrust::host_vector h_A(m*k); + thrust::host_vector h_B(n*k); + thrust::host_vector h_C(m*n); + + for (int j = 0; j < m*k; ++j) h_A[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); + for (int j = 0; j < n*k; ++j) h_B[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); + for (int j = 0; j < m*n; ++j) h_C[j] = static_cast(-1); + + thrust::device_vector d_A = h_A; + thrust::device_vector d_B = h_B; + thrust::device_vector d_C = h_C; + + double gflops = (2.0*m*n*k) * 1e-9; + + const int timing_iterations = 100; + GPU_Clock timer; + + int ldA = 0, ldB = 0, ldC = m; + + if (transA == 'N') { + ldA = m; + } else if (transA == 'T') { + ldA = k; + } else { + assert(false); + } + + if (transB == 'N') { + ldB = k; + } else if (transB == 'T') { + ldB = n; + } else { + assert(false); + } + // Run once + d_C = h_C; + gemm(transA, transB, m, n, k, + alpha, + d_A.data().get(), ldA, + d_B.data().get(), ldB, + beta, + d_C.data().get(), ldC); + CUTE_CHECK_LAST(); + thrust::host_vector cute_result = d_C; + + // Timing iterations + timer.start(); + for (int i = 0; i < timing_iterations; ++i) { + gemm(transA, transB, m, 
n, k, + alpha, + d_A.data().get(), ldA, + d_B.data().get(), ldB, + beta, + d_C.data().get(), ldC); + } + double cute_time = timer.seconds() / timing_iterations; + CUTE_CHECK_LAST(); + printf("CUTE_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / cute_time, cute_time*1000); + return 0; +} diff --git a/examples/cute/tutorial/sgemm_2.cu b/examples/cute/tutorial/sgemm_2.cu new file mode 100644 index 0000000000..ee2b6b2e61 --- /dev/null +++ b/examples/cute/tutorial/sgemm_2.cu @@ -0,0 +1,523 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#include +#include +#include + +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "cutlass/util/GPU_Clock.hpp" +#include "cutlass/util/helper_cuda.hpp" + +template +__global__ static +__launch_bounds__(decltype(size(TiledMma{}))::value) +void +gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, + TA const* A, AStride dA, ASmemLayout sA_layout, TiledCopyA copy_a, + TB const* B, BStride dB, BSmemLayout sB_layout, TiledCopyB copy_b, + TC * C, CStride dC, CSmemLayout , TiledMma mma, + Alpha alpha, Beta beta) +{ + using namespace cute; + + // Preconditions + CUTE_STATIC_ASSERT_V(rank(shape_MNK) == Int<3>{}); // (M, N, K) + CUTE_STATIC_ASSERT_V(rank(cta_tiler) == Int<3>{}); // (BLK_M, BLK_N, BLK_K) + + CUTE_STATIC_ASSERT_V(size(copy_a) == size(mma)); // NumThreads + CUTE_STATIC_ASSERT_V(size(copy_b) == size(mma)); // NumThreads + + static_assert(is_static::value); + static_assert(is_static::value); + static_assert(is_static::value); + + CUTE_STATIC_ASSERT_V(size<0>(ASmemLayout{}) == size<0>(cta_tiler)); // BLK_M + CUTE_STATIC_ASSERT_V(size<0>(CSmemLayout{}) == size<0>(cta_tiler)); // BLK_M + CUTE_STATIC_ASSERT_V(size<0>(BSmemLayout{}) == size<1>(cta_tiler)); // BLK_N + CUTE_STATIC_ASSERT_V(size<1>(CSmemLayout{}) == size<1>(cta_tiler)); // BLK_N + CUTE_STATIC_ASSERT_V(size<1>(ASmemLayout{}) == size<2>(cta_tiler)); // BLK_K + CUTE_STATIC_ASSERT_V(size<1>(BSmemLayout{}) == size<2>(cta_tiler)); // BLK_K + + CUTE_STATIC_ASSERT_V(congruent(select<0,2>(shape_MNK), dA)); // dA strides for shape MK + CUTE_STATIC_ASSERT_V(congruent(select<1,2>(shape_MNK), dB)); // dB strides for shape NK + CUTE_STATIC_ASSERT_V(congruent(select<0,1>(shape_MNK), dC)); // dC strides for shape MN + + // + // Full and Tiled Tensors + // + + // Represent the full tensors + Tensor mA = make_tensor(make_gmem_ptr(A), select<0,2>(shape_MNK), dA); // (M,K) + Tensor mB = make_tensor(make_gmem_ptr(B), select<1,2>(shape_MNK), dB); // (N,K) + Tensor mC = make_tensor(make_gmem_ptr(C), select<0,1>(shape_MNK), dC); // (M,N) + + // Get the appropriate blocks for this thread block + auto cta_coord = make_coord(blockIdx.x, blockIdx.y, _); // (m,n,k) + Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) + Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) + Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) + + // Shared memory buffers + __shared__ TA smemA[cosize_v]; + __shared__ TB smemB[cosize_v]; + Tensor sA = make_tensor(make_smem_ptr(smemA), sA_layout); // (BLK_M,BLK_K) + Tensor sB = make_tensor(make_smem_ptr(smemB), sB_layout); // (BLK_N,BLK_K) + + // + // Partition the copying of A and B tiles across the threads + // + + // TUTORIAL: Example of partitioning via a TiledCopy + + ThrCopy thr_copy_a = copy_a.get_slice(threadIdx.x); + Tensor tAgA = thr_copy_a.partition_S(gA); // (CPY,CPY_M,CPY_K,k) + Tensor tAsA = thr_copy_a.partition_D(sA); // (CPY,CPY_M,CPY_K) + // Allocate registers same shape/layout as partitioned data + Tensor tArA = make_fragment_like(tAsA); // (CPY,CPY_M,CPY_K) + + ThrCopy thr_copy_b = copy_b.get_slice(threadIdx.x); + Tensor tBgB = thr_copy_b.partition_S(gB); // (CPY,CPY_N,CPY_K,k) + Tensor tBsB = thr_copy_b.partition_D(sB); // (CPY,CPY_N,CPY_K) + // Allocate registers same shape/layout as partitioned data + Tensor tBrB = make_fragment_like(tBsB); // (CPY,CPY_N,CPY_K) + + 
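// The partitioning above routes every gmem<->smem transfer through a TiledCopy, and the
// per-thread views carry (CPY, CPY_M, CPY_K) modes. A host-side sketch, assuming CuTe's
// UniversalCopy atom over 128-bit uint128_t vectors (the same style the NT setup below
// constructs), that prints one thread's partition of a 128x8 m-major tile; the function
// name is illustrative only:

#include <cute/tensor.hpp>

void inspect_tiled_copy_partition() {
  using namespace cute;
  float* gmem = nullptr;   // layout inspection only -- nothing is dereferenced
  Tensor gA = make_tensor(make_gmem_ptr(gmem),
                          make_layout(make_shape(Int<128>{}, Int<8>{})));   // (BLK_M,BLK_K), m-major
  TiledCopy copyA = make_tiled_copy(Copy_Atom<UniversalCopy<uint128_t>, float>{},
                                    Layout<Shape<_32,_8>>{},    // 32x8 threads, m-major
                                    Layout<Shape< _4,_1>>{});   // 4x1 values per thread
  ThrCopy thr_copy = copyA.get_slice(0);        // slice owned by thread 0
  Tensor tAgA = thr_copy.partition_S(gA);       // (CPY,CPY_M,CPY_K) view of the source tile
  print(tAgA); print("\n");                     // shows the per-thread shape and strides
}

// The static asserts that follow check exactly these CPY_M/CPY_N/CPY_K extents against the
// shared-memory and register fragments.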
CUTE_STATIC_ASSERT_V(size<1>(tAgA) == size<1>(tAsA)); // CPY_M + CUTE_STATIC_ASSERT_V(size<1>(tAgA) == size<1>(tArA)); // CPY_M + CUTE_STATIC_ASSERT_V(size<2>(tAgA) == size<2>(tAsA)); // CPY_K + CUTE_STATIC_ASSERT_V(size<2>(tAgA) == size<2>(tArA)); // CPY_K + CUTE_STATIC_ASSERT_V(size<1>(tBgB) == size<1>(tBsB)); // CPY_N + CUTE_STATIC_ASSERT_V(size<1>(tBgB) == size<1>(tBrB)); // CPY_N + CUTE_STATIC_ASSERT_V(size<2>(tBgB) == size<2>(tBsB)); // CPY_K + CUTE_STATIC_ASSERT_V(size<2>(tBgB) == size<2>(tBrB)); // CPY_K + + // Copy gmem to rmem for k_tile=0 + copy(copy_a, tAgA(_,_,_,0), tArA); + copy(copy_b, tBgB(_,_,_,0), tBrB); + // + // Define A/B partitioning and C accumulators + // + + // TUTORIAL: Example of partitioning via a TiledMMA + + ThrMMA thr_mma = mma.get_slice(threadIdx.x); + Tensor tCsA = thr_mma.partition_A(sA); // (MMA,MMA_M,MMA_K) + Tensor tCsB = thr_mma.partition_B(sB); // (MMA,MMA_N,MMA_K) + Tensor tCgC = thr_mma.partition_C(gC); // (MMA,MMA_M,MMA_N) + + // Allocate the accumulators -- same size as the projected data + Tensor tCrC = thr_mma.make_fragment_C(tCgC); // (MMA,MMA_M,MMA_N) + + CUTE_STATIC_ASSERT_V( shape(tCrC) == shape(tCgC)); // (MMA,MMA_M,MMA_N) + CUTE_STATIC_ASSERT_V(size<1>(tCgC) == size<1>(tCsA)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(tCgC) == size<1>(tCsB)); // MMA_N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // MMA_K + + // Clear the accumulators + clear(tCrC); + +#if 0 + if(thread0()) { + print(" mA : "); print( mA); print("\n"); + print(" gA : "); print( gA); print("\n"); + print(" sA : "); print( sA); print("\n"); + print("tAgA : "); print(tAgA); print("\n"); + print("tAsA : "); print(tAsA); print("\n"); + print("tArA : "); print(tArA); print("\n"); + } +#endif + +#if 0 + if(thread0()) { + print(" mB : "); print( mB); print("\n"); + print(" gB : "); print( gB); print("\n"); + print(" sB : "); print( sB); print("\n"); + print("tBgB : "); print(tBgB); print("\n"); + print("tBsB : "); print(tBsB); print("\n"); + print("tArA : "); print(tArA); print("\n"); + } +#endif + +#if 0 + if(thread0()) { + print(" mC : "); print( mC); print("\n"); + print(" gC : "); print( gC); print("\n"); + print("tCsA : "); print(tCsA); print("\n"); + print("tCsB : "); print(tCsB); print("\n"); + print("tCgC : "); print(tCgC); print("\n"); + print("tCrC : "); print(tCrC); print("\n"); + } +#endif + +#if 1 + + // TUTORIAL: Example of an inner loop that pipelines compute with reads + // from global memory by staging through register and shared memory. + // Data is read from global to registers, then to shared via the TiledCopy partitions + // gemm(.) operates on the shared memory directly via the TiledMMA partitions + + auto K_TILE_MAX = size<3>(tAgA); + + for (int k_tile = 0; k_tile < K_TILE_MAX; ++k_tile) + { + // Copy rmem to smem with tA|tB thread-partitioned tensors + __syncthreads(); // Wait for all threads to consume smem + copy(tArA, tAsA); + copy(tBrB, tBsB); + __syncthreads(); // Wait for all threads to consume smem + + // Copy gmem to rmem for k_tile+1 with tA|tB thread-partitioned tensors + int k_tile_next = (k_tile + 1 < K_TILE_MAX) ? 
k_tile + 1 : k_tile; + copy(copy_a, tAgA(_,_,_,k_tile_next), tArA); + copy(copy_b, tBgB(_,_,_,k_tile_next), tBrB); + // TUTORIAL: The above call to copy(copy_a, tAgA(_,_,_,k_tile_next), tArA) is equivalent to + // CUTE_UNROLL + // for (int k = 0; k < size<1>(tCsA); ++k) { + // CUTE_UNROLL + // for (int m = 0; m < size<0>(tCrC); ++m) { + // copy_a.call(tAgA(_,m,k), tArA(_,m,k); + // } + // } + + // Compute gemm on mma-partitioned smem + gemm(mma, tCsA, tCsB, tCrC); + // TUTORIAL: The above call to gemm(tCsA, tCsB, tCrC) is equivalent to + // CUTE_UNROLL + // for (int k = 0; k < size<1>(tCsA); ++k) { + // CUTE_UNROLL + // for (int m = 0; m < size<0>(tCrC); ++m) { + // CUTE_UNROLL + // for (int n = 0; n < size<1>(tCrC); ++n) { + // mma.call(tCsA(_,m,k), tCsB(_,n,k), tCrC(_,m,n); + // } + // } + // } + } + +#endif + + // + // Epilogue + // + + axpby(alpha, tCrC, beta, tCgC); +} + +// Setup params for a NT GEMM +template +void +gemm_nt(int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + using namespace cute; + + // Define shapes (dynamic) + auto M = int(m); + auto N = int(n); + auto K = int(k); + auto prob_shape = make_shape(M, N, K); // (M, N, K) + + // Define NT strides (mixed) + auto dA = make_stride(Int<1>{}, ldA); // (dM, dK) + auto dB = make_stride(Int<1>{}, ldB); // (dN, dK) + auto dC = make_stride(Int<1>{}, ldC); // (dM, dN) + + // Define CTA tile sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 8>{}; + auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) + + // Define the smem layouts (static) + auto sA = make_layout(make_shape(bM, bK)); // (m,k) -> smem_idx; m-major + auto sB = make_layout(make_shape(bN, bK)); // (n,k) -> smem_idx; n-major + auto sC = make_layout(make_shape(bM, bN)); // (m,n) -> smem_idx; m-major + + // Define the thread layouts (static) + + // TUTORIAL: Construct TiledCopy with a particular Copy_Atom to use and + // define the partitioning pattern to apply. + // Each thread will (try to) copy 4x1 elements of type TA using 128-bit copy. + // Use 32x8 of these threads. + + TiledCopy copyA = make_tiled_copy(Copy_Atom, TA>{}, + Layout>{}, // Thr layout 32x8 m-major + Layout>{}); // Val layout 4x1 m-major + TiledCopy copyB = make_tiled_copy(Copy_Atom, TB>{}, + Layout>{}, // Thr layout 32x8 n-major + Layout>{}); // Val layout 4x1 n-major + + // TUTORIAL: Construct TiledMMA with a particular MMA_Atom to use and + // define the partitioning pattern to apply. + // Use a 1x1x1 FMA on the types TC += TA * TB. Each atom requires a single thread. + // Reproduce that atom 16x16x1 times (m-major) across threads so that we use 256 threads. 
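// The TiledMMA referenced in the comment above replicates a one-thread FMA atom across a
// 16x16x1 m-major grid of threads, so size(mmaC) = 16*16*1 = 256 and dimBlock picks up
// 256 threads per CTA. A minimal sketch of that construction for float operands, assuming
// CuTe's UniversalFMA atom; the function name is illustrative only:

#include <cute/tensor.hpp>

void build_fma_tiled_mma() {
  using namespace cute;
  TiledMMA mma = make_tiled_mma(UniversalFMA<float, float, float>{},   // TC += TA * TB, one thread per atom
                                Layout<Shape<_16,_16,_1>>{});          // 16x16x1 atoms -> 256 threads
  print(size(mma)); print("\n");                                       // _256
}

// The construction that follows in gemm_nt is this same call with the TA/TB/TC template
// parameters substituted.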
+ + TiledMMA mmaC = make_tiled_mma(UniversalFMA{}, + Layout>{}); // 16x16x1 UniversalFMA + +#if 0 + print(copyA); + print(copyB); + print(mmaC); +#endif + +#if 0 + print_latex(copyA); + print_latex(copyB); + print_latex(mmaC); +#endif + + dim3 dimBlock(size(mmaC)); + dim3 dimGrid(size(ceil_div(M, bM)), + size(ceil_div(N, bN))); + gemm_device<<>> + (prob_shape, cta_tiler, + A, dA, sA, copyA, + B, dB, sB, copyB, + C, dC, sC, mmaC, + alpha, beta); +} + +// Setup params for a TN GEMM +template +void +gemm_tn(int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + using namespace cute; + + // Define shapes (dynamic) + auto M = int(m); + auto N = int(n); + auto K = int(k); + auto prob_shape = make_shape(M, N, K); // (M, N, K) + + // Define TN strides (mixed) + auto dA = make_stride(ldA, Int<1>{}); // (dM, dK) + auto dB = make_stride(ldB, Int<1>{}); // (dN, dK) + auto dC = make_stride(Int<1>{}, ldC); // (dM, dN) + + // Define CTA tile sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 8>{}; + auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) + + // Define the smem layouts (static) + auto sA = make_layout(make_shape ( bM, bK), + make_stride(Int<1>{}, bM+Int<1>{})); // (m,k) -> smem_idx; padded m-major + auto sB = make_layout(make_shape ( bN, bK), + make_stride(Int<1>{}, bN+Int<1>{})); // (n,k) -> smem_idx; padded n-major + auto sC = make_layout(make_shape(bM, bN)); // (m,n) -> smem_idx + + // TUTORIAL: Construct TiledCopy to define the Copy_Atom to use and the + // partitioning pattern to apply. + // Each thread will copy 1x1 elements of type TA. + // Use 32x8 of these threads arranged in k-major. + + TiledCopy copyA = make_tiled_copy(Copy_Atom, TA>{}, + Layout,Stride<_8,_1>>{}, // Thr layout 32x8 k-major + Layout>{}); // Val layout 1x1 + TiledCopy copyB = make_tiled_copy(Copy_Atom, TB>{}, + Layout,Stride<_8,_1>>{}, // Thr layout 32x8 k-major + Layout>{}); // Val layout 1x1 + + // TUTORIAL: Construct TiledMMA to define the MMA_Atom to use and the + // partitioning pattern to apply. + // Use a 1x1x1 FMA on the types TC += TA * TB. Each atom requires a single thread. + // Reproduce that atom 16x16x1 times (m-major) across threads so that we use 256 threads. 
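// The sA/sB shared-memory layouts defined above for this TN path pad the leading dimension
// to bM+1 / bN+1 so that consecutive k-columns start at staggered addresses, which helps
// avoid shared-memory bank conflicts with the k-major copies. A small sketch of the
// resulting layout for bM = 128, bK = 8, assuming only CuTe; the function name is
// illustrative only:

#include <cute/tensor.hpp>

void inspect_padded_smem_layout() {
  using namespace cute;
  auto sA_padded = make_layout(make_shape (Int<128>{}, Int<8>{}),
                               make_stride(Int<  1>{}, Int<129>{}));   // m-major with leading dimension 129
  print(sA_padded);         print("\n");   // (_128,_8):(_1,_129)
  print(cosize(sA_padded)); print("\n");   // _1031: one padding element between consecutive k-columns (7 extra over the unpadded 1024)
}

// The TiledMMA construction that follows mirrors the NT path sketched earlier: a 16x16x1
// grid of UniversalFMA atoms, i.e. 256 threads per CTA.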
+ + TiledMMA mmaC = make_tiled_mma(UniversalFMA{}, + Layout>{}); // 16x16x1 TiledMMA + +#if 0 + print(copyA); + print(copyB); + print(mmaC); +#endif + +#if 0 + print_latex(copyA); + print_latex(copyB); + print_latex(mmaC); +#endif + + dim3 dimBlock(size(mmaC)); + dim3 dimGrid(size(ceil_div(M, bM)), + size(ceil_div(N, bN))); + gemm_device<<>> + (prob_shape, cta_tiler, + A, dA, sA, copyA, + B, dB, sB, copyB, + C, dC, sC, mmaC, + alpha, beta); +} + +template +void +gemm(char transA, char transB, int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + if (transA == 'N' && transB == 'T') { + return gemm_nt(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); + } else + if (transA == 'T' && transB == 'N') { + return gemm_tn(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); + } + assert(false && "Not implemented"); +} + + +int main(int argc, char** argv) +{ + int m = 5120; + if (argc >= 2) + sscanf(argv[1], "%d", &m); + + int n = 5120; + if (argc >= 3) + sscanf(argv[2], "%d", &n); + + int k = 4096; + if (argc >= 4) + sscanf(argv[3], "%d", &k); + + char transA = 'N'; + if (argc >= 5) + sscanf(argv[4], "%c", &transA); + + char transB = 'T'; + if (argc >= 6) + sscanf(argv[5], "%c", &transB); + + using TA = float; + using TB = float; + using TC = float; + using TI = float; + + TI alpha = 1.0; + TI beta = 0.0; + + std::cout << "M = " << m << std::endl; + std::cout << "N = " << n << std::endl; + std::cout << "K = " << k << std::endl; + std::cout << "C = A^" << transA << " B^" << transB << std::endl; + + cute::device_init(0); + + thrust::host_vector h_A(m*k); + thrust::host_vector h_B(n*k); + thrust::host_vector h_C(m*n); + + for (int j = 0; j < m*k; ++j) h_A[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); + for (int j = 0; j < n*k; ++j) h_B[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); + for (int j = 0; j < m*n; ++j) h_C[j] = static_cast(-1); + + thrust::device_vector d_A = h_A; + thrust::device_vector d_B = h_B; + thrust::device_vector d_C = h_C; + + double gflops = (2.0*m*n*k) * 1e-9; + + const int timing_iterations = 100; + GPU_Clock timer; + + int ldA = 0, ldB = 0, ldC = m; + + if (transA == 'N') { + ldA = m; + } else if (transA == 'T') { + ldA = k; + } else { + assert(false); + } + + if (transB == 'N') { + ldB = k; + } else if (transB == 'T') { + ldB = n; + } else { + assert(false); + } + + // Run once + d_C = h_C; + gemm(transA, transB, m, n, k, + alpha, + d_A.data().get(), ldA, + d_B.data().get(), ldB, + beta, + d_C.data().get(), ldC); + CUTE_CHECK_LAST(); + thrust::host_vector cute_result = d_C; + + // Timing iterations + timer.start(); + for (int i = 0; i < timing_iterations; ++i) { + gemm(transA, transB, m, n, k, + alpha, + d_A.data().get(), ldA, + d_B.data().get(), ldB, + beta, + d_C.data().get(), ldC); + } + double cute_time = timer.seconds() / timing_iterations; + CUTE_CHECK_LAST(); + printf("CUTE_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / cute_time, cute_time*1000); + + return 0; +} diff --git a/examples/cute/tutorial/sgemm_sm70.cu b/examples/cute/tutorial/sgemm_sm70.cu new file mode 100644 index 0000000000..ef6284cf00 --- /dev/null +++ b/examples/cute/tutorial/sgemm_sm70.cu @@ -0,0 +1,526 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#include +#include +#include + +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "cutlass/util/GPU_Clock.hpp" +#include "cutlass/util/helper_cuda.hpp" + +template +__global__ static +__launch_bounds__(decltype(size(TiledMma{}))::value) +void +gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, + TA const* A, AStride dA, ASmemLayout sA_layout, TiledCopyA copy_a, + TB const* B, BStride dB, BSmemLayout sB_layout, TiledCopyB copy_b, + TC * C, CStride dC, CSmemLayout , TiledMma mma, + Alpha alpha, Beta beta) +{ + using namespace cute; + + // Preconditions + CUTE_STATIC_ASSERT_V(rank(shape_MNK) == Int<3>{}); // (M, N, K) + CUTE_STATIC_ASSERT_V(rank(cta_tiler) == Int<3>{}); // (BLK_M, BLK_N, BLK_K) + + CUTE_STATIC_ASSERT_V(size(copy_a) == size(mma)); // NumThreads + CUTE_STATIC_ASSERT_V(size(copy_b) == size(mma)); // NumThreads + + static_assert(is_static::value); + static_assert(is_static::value); + static_assert(is_static::value); + + CUTE_STATIC_ASSERT_V(size<0>(ASmemLayout{}) == size<0>(cta_tiler)); // BLK_M + CUTE_STATIC_ASSERT_V(size<0>(CSmemLayout{}) == size<0>(cta_tiler)); // BLK_M + CUTE_STATIC_ASSERT_V(size<0>(BSmemLayout{}) == size<1>(cta_tiler)); // BLK_N + CUTE_STATIC_ASSERT_V(size<1>(CSmemLayout{}) == size<1>(cta_tiler)); // BLK_N + CUTE_STATIC_ASSERT_V(size<1>(ASmemLayout{}) == size<2>(cta_tiler)); // BLK_K + CUTE_STATIC_ASSERT_V(size<1>(BSmemLayout{}) == size<2>(cta_tiler)); // BLK_K + + CUTE_STATIC_ASSERT_V(congruent(select<0,2>(shape_MNK), dA)); // dA strides for shape MK + CUTE_STATIC_ASSERT_V(congruent(select<1,2>(shape_MNK), dB)); // dB strides for shape NK + CUTE_STATIC_ASSERT_V(congruent(select<0,1>(shape_MNK), dC)); // dC strides for shape MN + + // + // Full and Tiled Tensors + // + + // Represent the full tensors + Tensor mA = make_tensor(make_gmem_ptr(A), select<0,2>(shape_MNK), dA); // 
(M,K) + Tensor mB = make_tensor(make_gmem_ptr(B), select<1,2>(shape_MNK), dB); // (N,K) + Tensor mC = make_tensor(make_gmem_ptr(C), select<0,1>(shape_MNK), dC); // (M,N) + + // Get the appropriate blocks for this thread block + auto cta_coord = make_coord(blockIdx.x, blockIdx.y, _); // (m,n,k) + Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) + Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) + Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) + + // Shared memory buffers + __shared__ TA smemA[cosize_v]; + __shared__ TB smemB[cosize_v]; + Tensor sA = make_tensor(make_smem_ptr(smemA), sA_layout); // (BLK_M,BLK_K) + Tensor sB = make_tensor(make_smem_ptr(smemB), sB_layout); // (BLK_N,BLK_K) + + // + // Partition the copying of A and B tiles across the threads + // + + // TUTORIAL: Example of partitioning via a TiledCopy + + ThrCopy thr_copy_a = copy_a.get_slice(threadIdx.x); + Tensor tAgA = thr_copy_a.partition_S(gA); // (CPY,CPY_M,CPY_K,k) + Tensor tAsA = thr_copy_a.partition_D(sA); // (CPY,CPY_M,CPY_K) + Tensor tArA = make_fragment_like(tAsA); // (CPY,CPY_M,CPY_K) + + ThrCopy thr_copy_b = copy_b.get_slice(threadIdx.x); + Tensor tBgB = thr_copy_b.partition_S(gB); // (CPY,CPY_N,CPY_K,k) + Tensor tBsB = thr_copy_b.partition_D(sB); // (CPY,CPY_N,CPY_K) + Tensor tBrB = make_fragment_like(tBsB); // (CPY,CPY_N,CPY_K) + + CUTE_STATIC_ASSERT_V(size<1>(tAgA) == size<1>(tAsA)); // CPY_M + CUTE_STATIC_ASSERT_V(size<1>(tAgA) == size<1>(tArA)); // CPY_M + CUTE_STATIC_ASSERT_V(size<2>(tAgA) == size<2>(tAsA)); // CPY_K + CUTE_STATIC_ASSERT_V(size<2>(tAgA) == size<2>(tArA)); // CPY_K + CUTE_STATIC_ASSERT_V(size<1>(tBgB) == size<1>(tBsB)); // CPY_N + CUTE_STATIC_ASSERT_V(size<1>(tBgB) == size<1>(tBrB)); // CPY_N + CUTE_STATIC_ASSERT_V(size<2>(tBgB) == size<2>(tBsB)); // CPY_K + CUTE_STATIC_ASSERT_V(size<2>(tBgB) == size<2>(tBrB)); // CPY_K + + // Copy gmem to rmem for k_tile=0 + copy(copy_a, tAgA(_,_,_,0), tArA); + copy(copy_b, tBgB(_,_,_,0), tBrB); + // + // Define A/B partitioning and C accumulators + // + + // TUTORIAL: Example of partitioning via a TiledMMA + + ThrMMA thr_mma = mma.get_slice(threadIdx.x); + Tensor tCsA = thr_mma.partition_A(sA); // (MMA,MMA_M,MMA_K) + Tensor tCsB = thr_mma.partition_B(sB); // (MMA,MMA_N,MMA_K) + Tensor tCgC = thr_mma.partition_C(gC); // (MMA,MMA_M,MMA_N) + + // Allocate registers for pipelining + Tensor tCrA = thr_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K) + Tensor tCrB = thr_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K) + // Allocate the accumulators -- same size as the projected data + Tensor tCrC = thr_mma.make_fragment_C(tCgC); // (MMA,MMA_M,MMA_N) + + CUTE_STATIC_ASSERT_V( shape(tCrA) == shape(tCsA)); // (MMA,MMA_M,MMA_K) + CUTE_STATIC_ASSERT_V( shape(tCrB) == shape(tCsB)); // (MMA,MMA_N,MMA_K) + CUTE_STATIC_ASSERT_V( shape(tCrC) == shape(tCgC)); // (MMA,MMA_M,MMA_N) + CUTE_STATIC_ASSERT_V(size<1>(tCgC) == size<1>(tCsA)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(tCgC) == size<1>(tCsB)); // MMA_N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // MMA_K + + // Clear the accumulators + clear(tCrC); + +#if 0 + if(thread0()) { + print(" mA : "); print( mA); print("\n"); + print(" gA : "); print( gA); print("\n"); + print(" sA : "); print( sA); print("\n"); + print("tAgA : "); print(tAgA); print("\n"); + print("tAsA : "); print(tAsA); print("\n"); + print("tArA : "); print(tArA); print("\n"); + } +#endif + +#if 0 + if(thread0()) { + print(" mB : 
"); print( mB); print("\n"); + print(" gB : "); print( gB); print("\n"); + print(" sB : "); print( sB); print("\n"); + print("tBgB : "); print(tBgB); print("\n"); + print("tBsB : "); print(tBsB); print("\n"); + print("tArA : "); print(tArA); print("\n"); + } +#endif + +#if 0 + if(thread0()) { + print(" mC : "); print( mC); print("\n"); + print(" gC : "); print( gC); print("\n"); + print("tCsA : "); print(tCsA); print("\n"); + print("tCsB : "); print(tCsB); print("\n"); + print("tCgC : "); print(tCgC); print("\n"); + print("tCrC : "); print(tCrC); print("\n"); + } +#endif + +#if 1 + + // Copy rmem to smem + copy(tArA, tAsA); + copy(tBrB, tBsB); + __syncthreads(); + + // + // PIPELINED MAIN LOOP + // TUTORIAL: Example of a gemm loop that pipelines shared memory AND register memory + // Data is read from global to registers, then to shared via the tA|tB partitions + // Data is then copied from shared to registers in multiple waves via the tC partitions + // and gemm(.) operates on the current register wave + // + + // Load A, B shmem->regs for k_block=0 + copy(tCsA(_,_,0), tCrA(_,_,0)); + copy(tCsB(_,_,0), tCrB(_,_,0)); + auto K_TILE_MAX = size<3>(tAgA); + auto K_BLOCK_MAX = size<2>(tCrA); + + CUTE_NO_UNROLL + for (int k_tile = 0; k_tile < K_TILE_MAX; ++k_tile) + { + // Pipeline the k-mode of the block registers + CUTE_UNROLL + for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) + { + if (k_block == K_BLOCK_MAX - 1) + { + // Copy rmem to smem + __syncthreads(); + copy(tArA, tAsA); + copy(tBrB, tBsB); + __syncthreads(); + } + + // Copy smem to rmem for k_block+1 + int k_block_next = (k_block + 1) % K_BLOCK_MAX; + copy(tCsA(_,_,k_block_next), tCrA(_,_,k_block_next)); + copy(tCsB(_,_,k_block_next), tCrB(_,_,k_block_next)); + if (k_block == 0) + { + // Copy gmem to rmem for k_tile+1 + int k_tile_next = (k_tile + 1 < K_TILE_MAX) ? 
k_tile + 1 : k_tile; + copy(copy_a, tAgA(_,_,_,k_tile_next), tArA); + copy(copy_b, tBgB(_,_,_,k_tile_next), tBrB); + } + // Thread-level register gemm for k_block + gemm(mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC); + } // k_block + } // k_tile + +#endif + + // + // Epilogue + // + + axpby(alpha, tCrC, beta, tCgC); +} + +// Setup params for a NT GEMM +template +void +gemm_nt(int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + using namespace cute; + + // Define shapes (dynamic) + auto M = int(m); + auto N = int(n); + auto K = int(k); + auto prob_shape = make_shape(M, N, K); // (M, N, K) + + // Define NT strides (mixed) + auto dA = make_stride(Int<1>{}, ldA); // (dM, dK) + auto dB = make_stride(Int<1>{}, ldB); // (dN, dK) + auto dC = make_stride(Int<1>{}, ldC); // (dM, dN) + + // Define CTA tile sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 8>{}; + auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) + + // Define the smem layouts (static) + auto sA = make_layout(make_shape(bM, bK)); // (m,k) -> smem_idx; m-major + auto sB = make_layout(make_shape(bN, bK)); // (n,k) -> smem_idx; n-major + auto sC = make_layout(make_shape(bM, bN)); // (m,n) -> smem_idx; m-major + + // Define the thread layouts (static) + TiledCopy copyA = make_tiled_copy(Copy_Atom, TA>{}, + Layout>{}, // Thr layout 32x8 m-major + Layout>{}); // Val layout 4x1 m-major + TiledCopy copyB = make_tiled_copy(Copy_Atom, TB>{}, + Layout>{}, // Thr layout 32x8 n-major + Layout>{}); // Val layout 4x1 n-major + + TiledMMA mmaC = make_tiled_mma(UniversalFMA{}, + Layout>{}); // 16x16x1 TiledMMA + +#if 0 + print(copyA); + print(copyB); + print(mmaC); +#endif + +#if 0 + print_latex(copyA); + print_latex(copyB); + print_latex(mmaC); +#endif + + dim3 dimBlock(size(mmaC)); + dim3 dimGrid(size(ceil_div(M, bM)), + size(ceil_div(N, bN))); + gemm_device<<>> + (prob_shape, cta_tiler, + A, dA, sA, copyA, + B, dB, sB, copyB, + C, dC, sC, mmaC, + alpha, beta); +} + +// Setup params for a TN GEMM +template +void +gemm_tn(int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + using namespace cute; + + // Define shapes (dynamic) + auto M = int(m); + auto N = int(n); + auto K = int(k); + auto prob_shape = make_shape(M, N, K); // (M, N, K) + + // Define TN strides (mixed) + auto dA = make_stride(ldA, Int<1>{}); // (dM, dK) + auto dB = make_stride(ldB, Int<1>{}); // (dN, dK) + auto dC = make_stride(Int<1>{}, ldC); // (dM, dN) + + // Define CTA tile sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 8>{}; + auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) + + // Define the smem layouts (static) + auto sA = make_layout(make_shape ( bM, bK), + make_stride(Int<1>{}, bM+Int<1>{})); // (m,k) -> smem_idx; padded m-major + auto sB = make_layout(make_shape ( bN, bK), + make_stride(Int<1>{}, bN+Int<1>{})); // (n,k) -> smem_idx; padded n-major + auto sC = make_layout(make_shape(bM, bN)); // (m,n) -> smem_idx + + // Define the thread layouts (static) + + TiledCopy copyA = make_tiled_copy(Copy_Atom, TA>{}, + Layout,Stride<_8,_1>>{}, // Thr layout 32x8 k-major + Layout>{}); // Val layout 1x1 + TiledCopy copyB = make_tiled_copy(Copy_Atom, TB>{}, + Layout,Stride<_8,_1>>{}, // Thr layout 32x8 k-major + Layout>{}); // Val layout 1x1 + + TiledMMA mmaC = make_tiled_mma(UniversalFMA{}, + 
Layout>{}); // 16x16x1 TiledMMA + +#if 0 + print(copyA); + print(copyB); + print(mmaC); +#endif + +#if 0 + print_latex(copyA); + print_latex(copyB); + print_latex(mmaC); +#endif + + dim3 dimBlock(size(mmaC)); + dim3 dimGrid(size(ceil_div(M, bM)), + size(ceil_div(N, bN))); + gemm_device<<>> + (prob_shape, cta_tiler, + A, dA, sA, copyA, + B, dB, sB, copyB, + C, dC, sC, mmaC, + alpha, beta); +} + +template +void +gemm(char transA, char transB, int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + if (transA == 'N' && transB == 'T') { + return gemm_nt(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); + } else + if (transA == 'T' && transB == 'N') { + return gemm_tn(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); + } + assert(false && "Not implemented"); +} + + +int main(int argc, char** argv) +{ + cudaDeviceProp props; + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < 7) { + std::cout << "This example requires an Volta GPU or newer (CC >= 70)" << std::endl; + // Return 0 so tests pass if run on unsupported architectures or CUDA Toolkits. + return 0; + } + + int m = 5120; + if (argc >= 2) + sscanf(argv[1], "%d", &m); + + int n = 5120; + if (argc >= 3) + sscanf(argv[2], "%d", &n); + + int k = 4096; + if (argc >= 4) + sscanf(argv[3], "%d", &k); + + char transA = 'N'; + if (argc >= 5) + sscanf(argv[4], "%c", &transA); + + char transB = 'T'; + if (argc >= 6) + sscanf(argv[5], "%c", &transB); + + using TA = float; + using TB = float; + using TC = float; + using TI = float; + + TI alpha = 1.0; + TI beta = 0.0; + + std::cout << "M = " << m << std::endl; + std::cout << "N = " << n << std::endl; + std::cout << "K = " << k << std::endl; + std::cout << "C = A^" << transA << " B^" << transB << std::endl; + + thrust::host_vector h_A(m*k); + thrust::host_vector h_B(n*k); + thrust::host_vector h_C(m*n); + + for (int j = 0; j < m*k; ++j) h_A[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); + for (int j = 0; j < n*k; ++j) h_B[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); + for (int j = 0; j < m*n; ++j) h_C[j] = static_cast(-1); + + thrust::device_vector d_A = h_A; + thrust::device_vector d_B = h_B; + thrust::device_vector d_C = h_C; + + double gflops = (2.0*m*n*k) * 1e-9; + + const int timing_iterations = 100; + GPU_Clock timer; + + int ldA = 0, ldB = 0, ldC = m; + + if (transA == 'N') { + ldA = m; + } else if (transA == 'T') { + ldA = k; + } else { + assert(false); + } + + if (transB == 'N') { + ldB = k; + } else if (transB == 'T') { + ldB = n; + } else { + assert(false); + } + + // Run once + d_C = h_C; + gemm(transA, transB, m, n, k, + alpha, + d_A.data().get(), ldA, + d_B.data().get(), ldB, + beta, + d_C.data().get(), ldC); + CUTE_CHECK_LAST(); + thrust::host_vector cute_result = d_C; + + // Timing iterations + timer.start(); + for (int i = 0; i < timing_iterations; ++i) { + gemm(transA, transB, m, n, k, + alpha, + d_A.data().get(), ldA, + d_B.data().get(), ldB, + beta, + d_C.data().get(), ldC); + } + double cute_time = timer.seconds() / timing_iterations; + CUTE_CHECK_LAST(); + printf("CUTE_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / cute_time, cute_time*1000); + + return 0; +} diff --git a/examples/cute/tutorial/sgemm_sm80.cu b/examples/cute/tutorial/sgemm_sm80.cu new file mode 100644 index 
0000000000..5ae0bf0f8b --- /dev/null +++ b/examples/cute/tutorial/sgemm_sm80.cu @@ -0,0 +1,567 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#include +#include +#include + +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "cutlass/util/GPU_Clock.hpp" +#include "cutlass/util/helper_cuda.hpp" + +template +__global__ static +__launch_bounds__(decltype(size(TiledMma{}))::value) +void +gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, + TA const* A, AStride dA, ASmemLayout sA_layout, TiledCopyA copy_a, + TB const* B, BStride dB, BSmemLayout sB_layout, TiledCopyB copy_b, + TC * C, CStride dC, CSmemLayout , TiledMma mma, + Alpha alpha, Beta beta) +{ + using namespace cute; + + // Preconditions + CUTE_STATIC_ASSERT_V(rank(shape_MNK) == Int<3>{}); // (M, N, K) + CUTE_STATIC_ASSERT_V(rank(cta_tiler) == Int<3>{}); // (BLK_M, BLK_N, BLK_K) + + CUTE_STATIC_ASSERT_V(size(copy_a) == size(mma)); // NumThreads + CUTE_STATIC_ASSERT_V(size(copy_b) == size(mma)); // NumThreads + + static_assert(is_static::value); + static_assert(is_static::value); + static_assert(is_static::value); + + CUTE_STATIC_ASSERT_V(size<0>(ASmemLayout{}) == size<0>(cta_tiler)); // BLK_M + CUTE_STATIC_ASSERT_V(size<0>(CSmemLayout{}) == size<0>(cta_tiler)); // BLK_M + CUTE_STATIC_ASSERT_V(size<0>(BSmemLayout{}) == size<1>(cta_tiler)); // BLK_N + CUTE_STATIC_ASSERT_V(size<1>(CSmemLayout{}) == size<1>(cta_tiler)); // BLK_N + CUTE_STATIC_ASSERT_V(size<1>(ASmemLayout{}) == size<2>(cta_tiler)); // BLK_K + CUTE_STATIC_ASSERT_V(size<1>(BSmemLayout{}) == size<2>(cta_tiler)); // BLK_K + + CUTE_STATIC_ASSERT_V(congruent(select<0,2>(shape_MNK), dA)); // dA strides for shape MK + CUTE_STATIC_ASSERT_V(congruent(select<1,2>(shape_MNK), dB)); // dB strides for shape NK + CUTE_STATIC_ASSERT_V(congruent(select<0,1>(shape_MNK), dC)); // dC strides for shape MN + + // + // Full and Tiled Tensors + // + + // Represent the full tensors + Tensor mA = make_tensor(make_gmem_ptr(A), select<0,2>(shape_MNK), dA); // (M,K) + Tensor mB = make_tensor(make_gmem_ptr(B), select<1,2>(shape_MNK), dB); // (N,K) + Tensor mC = make_tensor(make_gmem_ptr(C), select<0,1>(shape_MNK), dC); // (M,N) + + // Get the appropriate blocks for this thread block + auto cta_coord = make_coord(blockIdx.x, blockIdx.y, _); // (m,n,k) + Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) + Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) + Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) + + // Shared memory buffers + __shared__ TA smemA[cosize_v]; + __shared__ TB smemB[cosize_v]; + Tensor sA = make_tensor(make_smem_ptr(smemA), sA_layout); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(smemB), sB_layout); // (BLK_N,BLK_K,PIPE) + + // + // Partition the copying of A and B tiles across the threads + // + + ThrCopy thr_copy_a = copy_a.get_slice(threadIdx.x); + Tensor tAgA = thr_copy_a.partition_S(gA); // (CPY,CPY_M,CPY_K,k) + Tensor tAsA = thr_copy_a.partition_D(sA); // (CPY,CPY_M,CPY_K,PIPE) + + ThrCopy thr_copy_b = copy_b.get_slice(threadIdx.x); + Tensor tBgB = thr_copy_b.partition_S(gB); // (CPY,CPY_N,CPY_K,k) + Tensor tBsB = thr_copy_b.partition_D(sB); // (CPY,CPY_N,CPY_K,PIPE) + + CUTE_STATIC_ASSERT_V(size<1>(tAgA) == size<1>(tAsA)); // CPY_M + CUTE_STATIC_ASSERT_V(size<2>(tAgA) == size<2>(tAsA)); // CPY_K + CUTE_STATIC_ASSERT_V(size<1>(tBgB) == size<1>(tBsB)); // CPY_N + CUTE_STATIC_ASSERT_V(size<2>(tBgB) == size<2>(tBsB)); // CPY_K + + // + // PREFETCH + // + 
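+  // Note (added commentary, not part of the original tutorial text): tAsA carries a
+  // trailing PIPE mode, (CPY,CPY_M,CPY_K,PIPE), while tAgA carries the k-tile mode,
+  // (CPY,CPY_M,CPY_K,k). The loop below issues cp.async copies for every pipe stage
+  // except the last; the final stage is filled from inside the main loop, so
+  // global->shared traffic overlaps with the MMAs on earlier stages.
+  // A small sanity-check sketch in the same #if 0 style used elsewhere in this file
+  // (assumes it sits after the tAgA/tAsA partitions, as here):
+#if 0
+  if(thread0()) {
+    print("PIPE stages : "); print(size<3>(tAsA)); print("\n");
+    print("k-tiles     : "); print(size<3>(tAgA)); print("\n");
+  }
+#endif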
+ auto K_PIPE_MAX = size<3>(tAsA); + + // Total count of tiles + int k_tile_count = size<3>(tAgA); + // Current tile index in gmem to read from + int k_tile_next = 0; + + // Start async loads for all pipes but the last + CUTE_UNROLL + for (int k_pipe = 0; k_pipe < K_PIPE_MAX-1; ++k_pipe) { + copy(copy_a, tAgA(_,_,_,k_tile_next), tAsA(_,_,_,k_pipe)); + copy(copy_b, tBgB(_,_,_,k_tile_next), tBsB(_,_,_,k_pipe)); + cp_async_fence(); + --k_tile_count; + if (k_tile_count > 0) { ++k_tile_next; } + } + + // + // Define A/B partitioning and C accumulators + // + + ThrMMA thr_mma = mma.get_slice(threadIdx.x); + Tensor tCsA = thr_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thr_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + Tensor tCgC = thr_mma.partition_C(gC); // (MMA,MMA_M,MMA_N) + + // Allocate registers for pipelining + Tensor tCrA = thr_mma.make_fragment_A(tCsA(_,_,_,0)); // (MMA,MMA_M,MMA_K) + Tensor tCrB = thr_mma.make_fragment_B(tCsB(_,_,_,0)); // (MMA,MMA_N,MMA_K) + // Allocate the accumulators -- same size as the projected data + Tensor tCrC = thr_mma.make_fragment_C(tCgC); // (MMA,MMA_M,MMA_N) + + CUTE_STATIC_ASSERT_V(( shape(tCrA) == take<0,3>(shape(tCsA)))); // (MMA,MMA_M,MMA_K) + CUTE_STATIC_ASSERT_V(( shape(tCrB) == take<0,3>(shape(tCsB)))); // (MMA,MMA_N,MMA_K) + CUTE_STATIC_ASSERT_V(( shape(tCrC) == take<0,3>(shape(tCgC)))); // (MMA,MMA_M,MMA_N) + CUTE_STATIC_ASSERT_V((size<1>(tCgC) == size<1>(tCsA))); // MMA_M + CUTE_STATIC_ASSERT_V((size<2>(tCgC) == size<1>(tCsB))); // MMA_N + CUTE_STATIC_ASSERT_V((size<2>(tCsA) == size<2>(tCsB))); // MMA_K + + // Clear the accumulators + clear(tCrC); + +#if 0 + if(thread0()) { + print(" mA : "); print( mA); print("\n"); + print(" gA : "); print( gA); print("\n"); + print(" sA : "); print( sA); print("\n"); + print("tAgA : "); print(tAgA); print("\n"); + print("tAsA : "); print(tAsA); print("\n"); + } +#endif + +#if 0 + if(thread0()) { + print(" mB : "); print( mB); print("\n"); + print(" gB : "); print( gB); print("\n"); + print(" sB : "); print( sB); print("\n"); + print("tBgB : "); print(tBgB); print("\n"); + print("tBsB : "); print(tBsB); print("\n"); + } +#endif + +#if 0 + if(thread0()) { + print(" mC : "); print( mC); print("\n"); + print(" gC : "); print( gC); print("\n"); + print("tCsA : "); print(tCsA); print("\n"); + print("tCsB : "); print(tCsB); print("\n"); + print("tCgC : "); print(tCgC); print("\n"); + print("tCrA : "); print(tCrA); print("\n"); + print("tCrB : "); print(tCrB); print("\n"); + print("tCrC : "); print(tCrC); print("\n"); + } +#endif + +#if 1 + + // Current pipe index in smem to read from + int smem_pipe_read = 0; + // Current pipe index in smem to write to + int smem_pipe_write = K_PIPE_MAX-1; + + // Pipe slice + Tensor tCsA_p = tCsA(_,_,_,smem_pipe_read); + Tensor tCsB_p = tCsB(_,_,_,smem_pipe_read); + + // Size of the register pipeline + auto K_BLOCK_MAX = size<2>(tCrA); + + // PREFETCH register pipeline + if (K_BLOCK_MAX > 1) { + // Wait until our first prefetched tile is loaded in + cp_async_wait(); + __syncthreads(); + + // Prefetch the first rmem from the first k-tile + copy(tCsA_p(_,_,Int<0>{}), tCrA(_,_,Int<0>{})); + copy(tCsB_p(_,_,Int<0>{}), tCrB(_,_,Int<0>{})); + } + + // + // PIPELINED MAIN LOOP + // TUTORIAL: Example of a gemm loop that pipelines shared memory using SM80's cp.async instructions + // and explicit pipelines in shared memory. + // Data is read from global(k_tile_next) to shared(smem_pipe_write). + // Data is read from shared(smem_pipe_read) to registers(k_block_next). 
+ // Data is computed on registers(b_block). + // + // This allows all copies and compute to overlap: + // Copy from gmem->smem can overlap with copies from smem->rmem and compute on rmem. + // Copy from smem->rmem can overlap with compute on rmem. + // + + CUTE_NO_UNROLL + while (k_tile_count > -(K_PIPE_MAX-1)) + { + CUTE_UNROLL + for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) + { + if (k_block == K_BLOCK_MAX - 1) + { + // Slice the smem_pipe_read smem + tCsA_p = tCsA(_,_,_,smem_pipe_read); + tCsB_p = tCsB(_,_,_,smem_pipe_read); + + // Commit the smem for smem_pipe_read + cp_async_wait(); + __syncthreads(); + } + + // Load A, B shmem->regs for k_block+1 + auto k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX; // static + copy(tCsA_p(_,_,k_block_next), tCrA(_,_,k_block_next)); + copy(tCsB_p(_,_,k_block_next), tCrB(_,_,k_block_next)); + // Copy gmem to smem before computing gemm on each k-pipe + if (k_block == 0) + { + copy(copy_a, tAgA(_,_,_,k_tile_next), tAsA(_,_,_,smem_pipe_write)); + copy(copy_b, tBgB(_,_,_,k_tile_next), tBsB(_,_,_,smem_pipe_write)); + cp_async_fence(); + + // Advance the gmem tile + --k_tile_count; + if (k_tile_count > 0) { ++k_tile_next; } + + // Advance the smem pipe + smem_pipe_write = smem_pipe_read; + ++smem_pipe_read; + smem_pipe_read = (smem_pipe_read == K_PIPE_MAX) ? 0 : smem_pipe_read; + } + // Thread-level register gemm for k_block + gemm(mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC); + } + + } + +#endif + + // + // Epilogue + // + + axpby(alpha, tCrC, beta, tCgC); +} + +// Setup params for a NT GEMM +template +void +gemm_nt(int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + using namespace cute; + + // Define shapes (dynamic) + auto M = int(m); + auto N = int(n); + auto K = int(k); + auto prob_shape = make_shape(M, N, K); // (M, N, K) + + // Define NT strides (mixed) + auto dA = make_stride(Int<1>{}, ldA); // (dM, dK) + auto dB = make_stride(Int<1>{}, ldB); // (dN, dK) + auto dC = make_stride(Int<1>{}, ldC); // (dM, dN) + + // Define CTA tile sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 8>{}; + auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) + auto bP = Int<3>{}; // Pipeline + + // Define the smem layouts (static) + auto sA = make_layout(make_shape(bM, bK, bP)); // (m,k,p) -> smem_idx; m-major + auto sB = make_layout(make_shape(bN, bK, bP)); // (n,k,p) -> smem_idx; n-major + auto sC = make_layout(make_shape(bM, bN)); // (m,n) -> smem_idx; m-major + + // Define the thread layouts (static) + + TiledCopy copyA = make_tiled_copy(Copy_Atom, TA>{}, + Layout>{}, // Thr layout 32x8 m-major + Layout>{});// Val layout 4x1 m-major + TiledCopy copyB = make_tiled_copy(Copy_Atom, TB>{}, + Layout>{}, // Thr layout 32x8 n-major + Layout>{});// Val layout 4x1 n-major + + TiledMMA mmaC = make_tiled_mma(UniversalFMA{}, + Layout>{}); // 16x16x1 TiledMMA + +#if 0 + print(copyA); + print(copyB); + print(mmaC); +#endif + +#if 0 + print_latex(copyA); + print_latex(copyB); + print_latex(mmaC); +#endif + + dim3 dimBlock(size(mmaC)); + dim3 dimGrid(size(ceil_div(M, bM)), + size(ceil_div(N, bN))); + gemm_device<<>> + (prob_shape, cta_tiler, + A, dA, sA, copyA, + B, dB, sB, copyB, + C, dC, sC, mmaC, + alpha, beta); +} + +// Setup params for a TN GEMM +template +void +gemm_tn(int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + using 
namespace cute; + + // Define shapes (dynamic) + auto M = int(m); + auto N = int(n); + auto K = int(k); + auto prob_shape = make_shape(M, N, K); // (M, N, K) + + // Define TN strides (mixed) + auto dA = make_stride(ldA, Int<1>{}); // (dM, dK) + auto dB = make_stride(ldB, Int<1>{}); // (dN, dK) + auto dC = make_stride(Int<1>{}, ldC); // (dM, dN) + + // Define CTA tile sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 8>{}; + auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) + auto bP = Int<3>{}; // Pipeline + + // Define the smem layouts (static) + auto sA_atom = make_layout(make_shape ( bM, bK), + make_stride(Int<1>{}, bM+Int<1>{})); // (m,k) -> smem_idx; padded m-major + [[maybe_unused]] auto sB_atom = make_layout(make_shape ( bN, bK), + make_stride(Int<1>{}, bN+Int<1>{})); // (n,k) -> smem_idx; padded n-major + auto sA = tile_to_shape(sA_atom, make_shape(bM, bK, bP)); + auto sB = tile_to_shape(sA_atom, make_shape(bN, bK, bP)); + auto sC = make_layout(make_shape(bM, bN)); // (m,n) -> smem_idx + + // Define the thread layouts (static) + + TiledCopy copyA = make_tiled_copy(Copy_Atom, TA>{}, + Layout,Stride<_8,_1>>{}, // Thr layout 32x8 k-major + Layout>{}); // Val layout 1x1 + TiledCopy copyB = make_tiled_copy(Copy_Atom, TB>{}, + Layout,Stride<_8,_1>>{}, // Thr layout 32x8 k-major + Layout>{}); // Val layout 1x1 + + TiledMMA mmaC = make_tiled_mma(UniversalFMA{}, + Layout>{}); // 16x16x1 TiledMMA + +#if 0 + print(copyA); + print(copyB); + print(mmaC); +#endif + +#if 0 + print_latex(copyA); + print_latex(copyB); + print_latex(mmaC); +#endif + + dim3 dimBlock(size(mmaC)); + dim3 dimGrid(size(ceil_div(M, bM)), + size(ceil_div(N, bN))); + gemm_device<<>> + (prob_shape, cta_tiler, + A, dA, sA, copyA, + B, dB, sB, copyB, + C, dC, sC, mmaC, + alpha, beta); +} + +template +void +gemm(char transA, char transB, int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + if (transA == 'N' && transB == 'T') { + return gemm_nt(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); + } else + if (transA == 'T' && transB == 'N') { + return gemm_tn(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); + } + assert(false && "Not implemented"); +} + + +int main(int argc, char** argv) +{ + cudaDeviceProp props; + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major < 8) { + std::cout << "This example requires an Ampere GPU or newer (CC >= 80)" << std::endl; + // Return 0 so tests pass if run on unsupported architectures or CUDA Toolkits. 
+ return 0; + } + + int m = 5120; + if (argc >= 2) + sscanf(argv[1], "%d", &m); + + int n = 5120; + if (argc >= 3) + sscanf(argv[2], "%d", &n); + + int k = 4096; + if (argc >= 4) + sscanf(argv[3], "%d", &k); + + char transA = 'N'; + if (argc >= 5) + sscanf(argv[4], "%c", &transA); + + char transB = 'T'; + if (argc >= 6) + sscanf(argv[5], "%c", &transB); + + using TA = float; + using TB = float; + using TC = float; + using TI = float; + + TI alpha = 1.0; + TI beta = 0.0; + + std::cout << "M = " << m << std::endl; + std::cout << "N = " << n << std::endl; + std::cout << "K = " << k << std::endl; + std::cout << "C = A^" << transA << " B^" << transB << std::endl; + + thrust::host_vector h_A(m*k); + thrust::host_vector h_B(n*k); + thrust::host_vector h_C(m*n); + + for (int j = 0; j < m*k; ++j) h_A[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); + for (int j = 0; j < n*k; ++j) h_B[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); + for (int j = 0; j < m*n; ++j) h_C[j] = static_cast(-1); + + thrust::device_vector d_A = h_A; + thrust::device_vector d_B = h_B; + thrust::device_vector d_C = h_C; + + double gflops = (2.0*m*n*k) * 1e-9; + + const int timing_iterations = 100; + GPU_Clock timer; + + int ldA = 0, ldB = 0, ldC = m; + + if (transA == 'N') { + ldA = m; + } else if (transA == 'T') { + ldA = k; + } else { + assert(false); + } + + if (transB == 'N') { + ldB = k; + } else if (transB == 'T') { + ldB = n; + } else { + assert(false); + } + + // Run once + d_C = h_C; + gemm(transA, transB, m, n, k, + alpha, + d_A.data().get(), ldA, + d_B.data().get(), ldB, + beta, + d_C.data().get(), ldC); + CUTE_CHECK_LAST(); + thrust::host_vector cute_result = d_C; + + // Timing iterations + timer.start(); + for (int i = 0; i < timing_iterations; ++i) { + gemm(transA, transB, m, n, k, + alpha, + d_A.data().get(), ldA, + d_B.data().get(), ldB, + beta, + d_C.data().get(), ldC); + } + double cute_time = timer.seconds() / timing_iterations; + CUTE_CHECK_LAST(); + printf("CUTE_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / cute_time, cute_time*1000); + + return 0; +} diff --git a/examples/cute/tutorial/tiled_copy.cu b/examples/cute/tutorial/tiled_copy.cu new file mode 100644 index 0000000000..a8ae3b1040 --- /dev/null +++ b/examples/cute/tutorial/tiled_copy.cu @@ -0,0 +1,256 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "cutlass/util/GPU_Clock.hpp" +#include "cutlass/util/helper_cuda.hpp" + +// This is a simple tutorial showing several ways to partition a tensor into tiles then +// perform efficient, coalesced copies. This example also shows how to vectorize accesses +// which may be a useful optimization or required for certain workloads. +// +// `copy_kernel()` and `copy_kernel_vectorized()` each assume a pair of tensors with +// dimensions (m, n) have been partitioned via `tiled_divide()`. +// +// The result are a part of compatible tensors with dimensions ((M, N), m', n'), where +// (M, N) denotes a statically sized tile, and m' and n' denote the number of such tiles +// within the tensor. +// +// Each statically sized tile is mapped to a CUDA threadblock which performs efficient +// loads and stores to Global Memory. +// +// `copy_kernel()` uses `cute::local_partition()` to partition the tensor and map +// the result to threads using a striped indexing scheme. Threads themselve are arranged +// in a (ThreadShape_M, ThreadShape_N) arrangement which is replicated over the tile. +// +// `copy_kernel_vectorized()` uses `cute::make_tiled_copy()` to perform a similar +// partitioning using `cute::Copy_Atom` to perform vectorization. The actual vector +// size is defined by `ThreadShape`. +// +// This example assumes the overall tensor shape is divisible by the tile size and +// does not perform predication. + + +/// Simple copy kernel. +// +// Uses local_partition() to partition a tile among threads arranged as (THR_M, THR_N). +template +__global__ void copy_kernel(TensorS S, TensorD D, ThreadLayout) +{ + using namespace cute; + + // Slice the tiled tensors + Tensor tile_S = S(make_coord(_,_), blockIdx.x, blockIdx.y); // (BlockShape_M, BlockShape_N) + Tensor tile_D = D(make_coord(_,_), blockIdx.x, blockIdx.y); // (BlockShape_M, BlockShape_N) + + // Construct a partitioning of the tile among threads with the given thread arrangement. + + // Concept: Tensor ThrLayout ThrIndex + Tensor thr_tile_S = local_partition(tile_S, ThreadLayout{}, threadIdx.x); // (ThrValM, ThrValN) + Tensor thr_tile_D = local_partition(tile_D, ThreadLayout{}, threadIdx.x); // (ThrValM, ThrValN) + + // Construct a register-backed Tensor with the same shape as each thread's partition + // Use make_tensor to try to match the layout of thr_tile_S + Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN) + + // Copy from GMEM to RMEM and from RMEM to GMEM + copy(thr_tile_S, fragment); + copy(fragment, thr_tile_D); +} + +/// Vectorized copy kernel. +/// +/// Uses `make_tiled_copy()` to perform a copy using vector instructions. This operation +/// has the precondition that pointers are aligned to the vector size. 
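+///
+/// For example, with Element = float and the (4,1) value layout used in main()
+/// below, each thread moves four contiguous floats (16 bytes) per access, so the
+/// source and destination pointers should be 16-byte aligned (allocations from
+/// cudaMalloc / thrust::device_vector satisfy this).
+///
+/// A minimal host-side sketch of building and launching such a copy (illustrative
+/// only; the 32x8 thread and 4x1 value layouts match the ones defined in main()
+/// below, while the 128-bit copy atom shown here is just one valid choice):
+///
+///   TiledCopy tiled_copy =
+///     make_tiled_copy(Copy_Atom<UniversalCopy<uint128_t>, float>{},   // 128-bit loads/stores
+///                     make_layout(make_shape(Int<32>{}, Int<8>{})),   // 32x8 threads
+///                     make_layout(make_shape(Int< 4>{}, Int<1>{})));  // 4x1 values per thread
+///   copy_kernel_vectorized<<<gridDim, 32*8>>>(tiled_S, tiled_D, tiled_copy);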
+/// +template +__global__ void copy_kernel_vectorized(TensorS S, TensorD D, Tiled_Copy tiled_copy) +{ + using namespace cute; + + // Slice the tensors to obtain a view into each tile. + Tensor tile_S = S(make_coord(_, _), blockIdx.x, blockIdx.y); // (BlockShape_M, BlockShape_N) + Tensor tile_D = D(make_coord(_, _), blockIdx.x, blockIdx.y); // (BlockShape_M, BlockShape_N) + + // Construct a Tensor corresponding to each thread's slice. + ThrCopy thr_copy = tiled_copy.get_thread_slice(threadIdx.x); + + Tensor thr_tile_S = thr_copy.partition_S(tile_S); // (CopyOp, CopyM, CopyN) + Tensor thr_tile_D = thr_copy.partition_D(tile_D); // (CopyOp, CopyM, CopyN) + + // Construct a register-backed Tensor with the same shape as each thread's partition + // Use make_fragment because the first mode is the instruction-local mode + Tensor fragment = make_fragment_like(thr_tile_D); // (CopyOp, CopyM, CopyN) + + // Copy from GMEM to RMEM and from RMEM to GMEM + copy(tiled_copy, thr_tile_S, fragment); + copy(tiled_copy, fragment, thr_tile_D); +} + +/// Main function +int main(int argc, char** argv) +{ + // + // Given a 2D shape, perform an efficient copy + // + + using namespace cute; + using Element = float; + + // Define a tensor shape with dynamic extents (m, n) + auto tensor_shape = make_shape(256, 512); + + // + // Allocate and initialize + // + + thrust::host_vector h_S(size(tensor_shape)); + thrust::host_vector h_D(size(tensor_shape)); + + for (size_t i = 0; i < h_S.size(); ++i) { + h_S[i] = static_cast(i); + h_D[i] = Element{}; + } + + thrust::device_vector d_S = h_S; + thrust::device_vector d_D = h_D; + + // + // Make tensors + // + + Tensor tensor_S = make_tensor(make_gmem_ptr(thrust::raw_pointer_cast(d_S.data())), make_layout(tensor_shape)); + Tensor tensor_D = make_tensor(make_gmem_ptr(thrust::raw_pointer_cast(d_D.data())), make_layout(tensor_shape)); + + // + // Tile tensors + // + + // Define a statically sized block (M, N). + // Note, by convention, capital letters are used to represent static modes. + auto block_shape = make_shape(Int<128>{}, Int<64>{}); + + if ((size<0>(tensor_shape) % size<0>(block_shape)) || (size<1>(tensor_shape) % size<1>(block_shape))) { + std::cerr << "The tensor shape must be divisible by the block shape." << std::endl; + return -1; + } + // Equivalent check to the above + if (not evenly_divides(tensor_shape, block_shape)) { + std::cerr << "Expected the block_shape to evenly divide the tensor shape." << std::endl; + return -1; + } + + // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static tile + // shape, and modes (m', n') correspond to the number of tiles. + // + // These will be used to determine the CUDA kernel grid dimensions. + Tensor tiled_tensor_S = tiled_divide(tensor_S, block_shape); // ((M, N), m', n') + Tensor tiled_tensor_D = tiled_divide(tensor_D, block_shape); // ((M, N), m', n') + + // Construct a TiledCopy with a specific access pattern. + // This version uses a + // (1) Layout-of-Threads to describe the number and arrangement of threads (e.g. row-major, col-major, etc), + // (2) Layout-of-Values that each thread will access. + + // Thread arrangement + Layout thr_layout = make_layout(make_shape(Int<32>{}, Int<8>{})); // (32,8) -> thr_idx + + // Value arrangement per thread + Layout val_layout = make_layout(make_shape(Int<4>{}, Int<1>{})); // (4,1) -> val_idx + + // Define `AccessType` which controls the size of the actual memory access instruction. 
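+  // (Added note) With the (4,1) value layout above and Element = float, each
+  // thread owns 4 x 4 B = 16 B of contiguous data, so a 128-bit access type lets
+  // that whole per-thread tile move as one vectorized load/store. Narrower access
+  // types remain correct but issue more, smaller transactions; wider ones are not
+  // compatible with the 4x1 value layout and are rejected at compile time.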
+ using CopyOp = UniversalCopy>; // A very specific access width copy instruction + //using CopyOp = UniversalCopy>; // A more generic type that supports many copy strategies + //using CopyOp = AutoVectorizingCopy; // An adaptable-width instruction that assumes maximal alignment of inputs + + // A Copy_Atom corresponds to one CopyOperation applied to Tensors of type Element. + using Atom = Copy_Atom; + + // Construct tiled copy, a tiling of copy atoms. + // + // Note, this assumes the vector and thread layouts are aligned with contigous data + // in GMEM. Alternative thread layouts are possible but may result in uncoalesced + // reads. Alternative value layouts are also possible, though incompatible layouts + // will result in compile time errors. + TiledCopy tiled_copy = make_tiled_copy(Atom{}, // Access strategy + thr_layout, // thread layout (e.g. 32x4 Col-Major) + val_layout); // value layout (e.g. 4x1) + + // + // Determine grid and block dimensions + // + + dim3 gridDim (size<1>(tiled_tensor_D), size<2>(tiled_tensor_D)); // Grid shape corresponds to modes m' and n' + dim3 blockDim(size(thr_layout)); + + // + // Launch the kernel + // + copy_kernel_vectorized<<< gridDim, blockDim >>>( + tiled_tensor_S, + tiled_tensor_D, + tiled_copy); + + cudaError result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "CUDA Runtime error: " << cudaGetErrorString(result) << std::endl; + return -1; + } + + // + // Verify + // + + h_D = d_D; + + int32_t errors = 0; + int32_t const kErrorLimit = 10; + + for (size_t i = 0; i < h_D.size(); ++i) { + if (h_S[i] != h_D[i]) { + std::cerr << "Error. S[" << i << "]: " << h_S[i] << ", D[" << i << "]: " << h_D[i] << std::endl; + + if (++errors >= kErrorLimit) { + std::cerr << "Aborting on " << kErrorLimit << "nth error." << std::endl; + return -1; + } + } + } + + std::cout << "Success." << std::endl; + + return 0; +} + diff --git a/examples/cute/tutorial/wgmma_sm90.cu b/examples/cute/tutorial/wgmma_sm90.cu new file mode 100644 index 0000000000..0baa494a37 --- /dev/null +++ b/examples/cute/tutorial/wgmma_sm90.cu @@ -0,0 +1,562 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#include +#include +#include + +#include +#include + +#include + +#include "cutlass/cluster_launch.hpp" +#include "cutlass/arch/barrier.h" +#include "cutlass/pipeline/sm90_pipeline.hpp" + +#include "cutlass/util/print_error.hpp" +#include "cutlass/util/GPU_Clock.hpp" +#include "cutlass/util/helper_cuda.hpp" +#include "cutlass/arch/mma_sm90.h" +#include "cutlass/device_kernel.h" + +using namespace cute; + +template // (N,K,P) +struct SharedStorage +{ + array_aligned> smem_A; + array_aligned> smem_B; + + uint64_t tma_barrier[size<2>(SmemLayoutA{})]; + uint64_t mma_barrier[size<2>(SmemLayoutA{})]; +}; + +template +__global__ static +__launch_bounds__(decltype(size(TiledMma{}))::value) +void +gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, + TA const* A, CUTLASS_GRID_CONSTANT TmaA const tma_a, + TB const* B, CUTLASS_GRID_CONSTANT TmaB const tma_b, + TC * C, CStride dC, TiledMma mma, + Alpha alpha, Beta beta) +{ + // Preconditions + CUTE_STATIC_ASSERT_V(rank(shape_MNK) == Int<3>{}); // (M, N, K) + CUTE_STATIC_ASSERT_V(rank(cta_tiler) == Int<3>{}); // (BLK_M, BLK_N, BLK_K) + + static_assert(is_static::value); + static_assert(is_static::value); + + CUTE_STATIC_ASSERT_V(size<0>(SmemLayoutA{}) == size<0>(cta_tiler)); // BLK_M + CUTE_STATIC_ASSERT_V(size<0>(SmemLayoutB{}) == size<1>(cta_tiler)); // BLK_N + CUTE_STATIC_ASSERT_V(size<1>(SmemLayoutA{}) == size<2>(cta_tiler)); // BLK_K + CUTE_STATIC_ASSERT_V(size<1>(SmemLayoutB{}) == size<2>(cta_tiler)); // BLK_K + + CUTE_STATIC_ASSERT_V(congruent(select<0,1>(shape_MNK), dC)); // dC strides for shape MN + + // + // Full and Tiled Tensors + // + + // Represent the full tensors + auto [M, N, K] = shape_MNK; + Tensor mA = tma_a.get_tma_tensor(make_shape(M,K)); // (M,K) TMA Tensor + Tensor mB = tma_b.get_tma_tensor(make_shape(N,K)); // (N,K) TMA Tensor + Tensor mC = make_tensor(make_gmem_ptr(C), make_shape(M,N), dC); // (M,N) + + // Get the appropriate blocks for this thread block + auto cta_coord = make_coord(blockIdx.x, blockIdx.y, _); // (m,n,k) + Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) + Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) + Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) + + // Shared memory tensors + extern __shared__ char shared_memory[]; + using SharedStorage = SharedStorage; + SharedStorage& smem = *reinterpret_cast(shared_memory); + Tensor sA = make_tensor(make_smem_ptr(smem.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(smem.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // + // Partition the copying of A and B tiles + // + // TUTORIAL: + // These are TMA partitionings, which have a dedicated custom partitioner. + // The Int<0>, Layout<_1> indicates that the TMAs are not multicasted. 
+ // Any multicasting must be in conformance with tma_x constructed with make_tma_atom on host. + // The group_modes<0,2> transforms the (X,Y,Z)-shaped tensors into ((X,Y),Z)-shaped tensors + // with the understanding that the TMA is responsible for everything in mode-0. + // The tma_partition reorders and offsets mode-0 according to the tma_x atom and the multicast info. + // + + auto [tAgA, tAsA] = tma_partition(tma_a, Int<0>{}, Layout<_1>{}, + group_modes<0,2>(sA), group_modes<0,2>(gA)); // (TMA,k) and (TMA,PIPE) + + auto [tBgB, tBsB] = tma_partition(tma_b, Int<0>{}, Layout<_1>{}, + group_modes<0,2>(sB), group_modes<0,2>(gB)); // (TMA,k) and (TMA,PIPE) + + // The TMA is responsible for copying everything in mode-0 of tAsA and tBsB + constexpr int kTmaTransactionBytes = CUTE_STATIC_V(size<0>(tAsA)) * sizeof(TA) + + CUTE_STATIC_V(size<0>(tBsB)) * sizeof(TB); + + // + // PREFETCH + // + + auto K_PIPE_MAX = size<1>(tAsA); + + // Total count of tiles + int k_tile_count = size<1>(tAgA); + // Current tile index in gmem to read from + int k_tile = 0; + + // Initialize Barriers + int warp_idx = cutlass::canonical_warp_idx_sync(); + int lane_predicate = cute::elect_one_sync(); + uint64_t* producer_mbar = smem.tma_barrier; + uint64_t* consumer_mbar = smem.mma_barrier; + + using ProducerBarType = cutlass::arch::ClusterTransactionBarrier; // TMA + using ConsumerBarType = cutlass::arch::ClusterBarrier; // MMA + CUTE_UNROLL + for (int pipe = 0; pipe < K_PIPE_MAX; ++pipe) { + if ((warp_idx == 0) && lane_predicate) { + ProducerBarType::init(&producer_mbar[pipe], 1); + ConsumerBarType::init(&consumer_mbar[pipe], 128); + } + } + // Ensure barrier init is complete on all CTAs + cluster_sync(); + + // Start async loads for all pipes + CUTE_UNROLL + for (int pipe = 0; pipe < K_PIPE_MAX; ++pipe) + { + if ((warp_idx == 0) && lane_predicate) + { + // Set expected Tx Bytes after each reset / init + ProducerBarType::arrive_and_expect_tx(&producer_mbar[pipe], kTmaTransactionBytes); + copy(tma_a.with(producer_mbar[pipe]), tAgA(_,k_tile), tAsA(_,pipe)); + copy(tma_b.with(producer_mbar[pipe]), tBgB(_,k_tile), tBsB(_,pipe)); + } + --k_tile_count; + ++k_tile; + } + + // + // Define A/B partitioning and C accumulators + // + // TUTORIAL: + // The tCrA and tCrB are actually Tensors of MMA Descriptors constructed as views of SMEM. + // The MMA Descriptor generation is automatic via inspection and validation of the SMEM Layouts. + // Because the MMA reads directly from SMEM and the fragments are descriptors rather than registers, + // there is no need for copy(tCsA, tCrA) in the mainloop. + // + + ThrMMA thr_mma = mma.get_thread_slice(threadIdx.x); + Tensor tCsA = thr_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thr_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + Tensor tCgC = thr_mma.partition_C(gC); // (MMA,MMA_M,MMA_N) + + // Allocate accumulators and clear them + Tensor tCrC = thr_mma.make_fragment_C(tCgC); // (MMA,MMA_M,MMA_N) + clear(tCrC); + + // Allocate "fragments" + Tensor tCrA = thr_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thr_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + // + // PIPELINED MAIN LOOP + // + // TUTORIAL: + // Rather than interleaving the stages and instructions like in SM70 and SM80, + // the SM90 mainloops rely on explicit producer-consumer synchronization + // on the purely async instructions TMA and MMA. + // More advanced pipeline and warp-specialization strategies are available in CUTLASS mainloops. 
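+  //
+  // Added summary (not in the original comment) of the per-stage handshake below:
+  //   producer (one elected thread): ConsumerBar::wait(stage, phase) ->
+  //     arrive_and_expect_tx -> TMA copies A/B into the stage; the TMA transaction
+  //     itself completes the producer barrier.
+  //   consumers (all 128 threads):   ProducerBar::wait(stage, phase) -> WGMMAs on
+  //     that stage -> ConsumerBar::arrive, releasing the stage for the next TMA write.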
+ // + + // A PipelineState is a circular pipe index [.index()] and a pipe phase [.phase()] + // that flips each cycle through K_PIPE_MAX. + auto write_state = cutlass::PipelineState(); // TMA writes + auto read_state = cutlass::PipelineState(); // MMA reads + + CUTE_NO_UNROLL + while (k_tile_count > -K_PIPE_MAX) + { + // Wait for Producer to complete + int read_pipe = read_state.index(); + ProducerBarType::wait(&producer_mbar[read_pipe], read_state.phase()); + + // MMAs to cover 1 K_TILE + warpgroup_arrive(); + gemm(mma, tCrA(_,_,_,read_pipe), tCrB(_,_,_,read_pipe), tCrC); // (V,M) x (V,N) => (V,M,N) + warpgroup_commit_batch(); + + // Wait for all MMAs in a K_TILE to complete + warpgroup_wait<0>(); + + // Notify that consumption is done + ConsumerBarType::arrive(&consumer_mbar[read_pipe]); + ++read_state; + + if ((warp_idx == 0) && lane_predicate) + { + int pipe = write_state.index(); + // Wait for Consumer to complete consumption + ConsumerBarType::wait(&consumer_mbar[pipe], write_state.phase()); + // Set expected Tx Bytes after each reset / init + ProducerBarType::arrive_and_expect_tx(&producer_mbar[pipe], kTmaTransactionBytes); + copy(tma_a.with(producer_mbar[pipe]), tAgA(_,k_tile), tAsA(_,pipe)); + copy(tma_b.with(producer_mbar[pipe]), tBgB(_,k_tile), tBsB(_,pipe)); + ++write_state; + } + --k_tile_count; + ++k_tile; + } + + // + // Epilogue (unpredicated) + // + + axpby(alpha, tCrC, beta, tCgC); +} + +// Setup params for an NT GEMM +template +void +gemm_nt(int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + // Define shapes (dynamic) + auto M = int(m); + auto N = int(n); + auto K = int(k); + auto prob_shape = make_shape(M, N, K); // (M, N, K) + + // Define TN strides (mixed) + auto dA = make_stride(Int<1>{}, ldA); // (dM, dK) + auto dB = make_stride(Int<1>{}, ldB); // (dN, dK) + auto dC = make_stride(Int<1>{}, ldC); // (dM, dN) + + // Define CTA tile sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 64>{}; + auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) + auto bP = Int< 3>{}; // Pipeline + + // Define the smem layouts (static) + auto sA = tile_to_shape(GMMA::Layout_MN_SW128_Atom{}, make_shape(bM,bK,bP)); + auto sB = tile_to_shape(GMMA::Layout_MN_SW128_Atom{}, make_shape(bN,bK,bP)); + + // Define the MMA + TiledMMA tiled_mma = make_tiled_mma(SM90_64x64x16_F16F16F16_SS{}); + + // Define the TMAs + // Create Global memory tensors for TMA inspection + Tensor mA = make_tensor(A, make_shape(M,K), dA); + Tensor mB = make_tensor(B, make_shape(N,K), dB); + + // Create TMA Atoms with the desired copy operation on the source and destination + Copy_Atom tmaA = make_tma_atom(SM90_TMA_LOAD{}, mA, sA(_,_,0), make_shape(bM,bK)); + Copy_Atom tmaB = make_tma_atom(SM90_TMA_LOAD{}, mB, sB(_,_,0), make_shape(bN,bK)); + + // + // Setup and Launch + // + + // Launch parameter setup + int smem_size = int(sizeof(SharedStorage)); + dim3 dimBlock(size(tiled_mma)); + dim3 dimCluster(2, 1, 1); + dim3 dimGrid(round_up(size(ceil_div(m, bM)), dimCluster.x), + round_up(size(ceil_div(n, bN)), dimCluster.y)); + cutlass::ClusterLaunchParams params = {dimGrid, dimBlock, dimCluster, smem_size}; + + void const* kernel_ptr = reinterpret_cast( + &gemm_device); + + CUTE_CHECK_ERROR(cudaFuncSetAttribute( + kernel_ptr, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + + // Kernel Launch + cutlass::Status status = cutlass::launch_kernel_on_cluster(params, kernel_ptr, + 
prob_shape, cta_tiler, + A, tmaA, + B, tmaB, + C, dC, tiled_mma, + alpha, beta); + CUTE_CHECK_LAST(); + + if (status != cutlass::Status::kSuccess) { + std::cerr << "Error: Failed at kernel Launch" << std::endl; + } +} + +// Setup params for a TN GEMM +template +void +gemm_tn(int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + // Define shapes (dynamic) + auto M = int(m); + auto N = int(n); + auto K = int(k); + auto prob_shape = make_shape(M, N, K); // (M, N, K) + + // Define TN strides (mixed) + auto dA = make_stride(ldA, Int<1>{}); // (dM, dK) + auto dB = make_stride(ldB, Int<1>{}); // (dN, dK) + auto dC = make_stride(Int<1>{}, ldC); // (dM, dN) + + // Define CTA tile sizes (static) + auto bM = Int<128>{}; + auto bN = Int<128>{}; + auto bK = Int< 64>{}; + auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) + auto bP = Int<3>{}; // Pipeline + + // Define the smem layouts (static) + auto sA = tile_to_shape(GMMA::Layout_K_SW128_Atom{}, make_shape(bM,bK,bP)); + auto sB = tile_to_shape(GMMA::Layout_K_SW128_Atom{}, make_shape(bN,bK,bP)); + + // Define the MMA + TiledMMA tiled_mma = make_tiled_mma(SM90_64x64x16_F16F16F16_SS{}); + + // Define the TMAs + // Create Global memory tensors for TMA inspection + Tensor mA = make_tensor(A, make_shape(M,K), dA); + Tensor mB = make_tensor(B, make_shape(N,K), dB); + + // Create TMA Atoms with the desired copy operation on the source and destination + Copy_Atom tmaA = make_tma_atom(SM90_TMA_LOAD{}, mA, sA(_,_,0), make_shape(bM,bK)); + Copy_Atom tmaB = make_tma_atom(SM90_TMA_LOAD{}, mB, sB(_,_,0), make_shape(bN,bK)); + + // + // Setup and Launch + // + + // Launch parameter setup + int smem_size = int(sizeof(SharedStorage)); + dim3 dimBlock(size(tiled_mma)); + dim3 dimCluster(2, 1, 1); + dim3 dimGrid(round_up(size(ceil_div(m, bM)), dimCluster.x), + round_up(size(ceil_div(n, bN)), dimCluster.y)); + cutlass::ClusterLaunchParams params = {dimGrid, dimBlock, dimCluster, smem_size}; + + void const* kernel_ptr = reinterpret_cast( + &gemm_device); + + CUTE_CHECK_ERROR(cudaFuncSetAttribute( + kernel_ptr, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + + // Kernel Launch + cutlass::Status status = cutlass::launch_kernel_on_cluster(params, kernel_ptr, + prob_shape, cta_tiler, + A, tmaA, + B, tmaB, + C, dC, tiled_mma, + alpha, beta); + CUTE_CHECK_LAST(); + + if (status != cutlass::Status::kSuccess) { + std::cerr << "Error: Failed at kernel Launch" << std::endl; + } +} + +template +void +gemm(char transA, char transB, int m, int n, int k, + Alpha alpha, + TA const* A, int ldA, + TB const* B, int ldB, + Beta beta, + TC * C, int ldC, + cudaStream_t stream = 0) +{ + if (transA == 'N' && transB == 'T') { + return gemm_nt(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); + } else + if (transA == 'T' && transB == 'N') { + return gemm_tn(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); + } + assert(false && "Not implemented"); +} + +int main(int argc, char** argv) +{ + + cudaDeviceProp props; + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (props.major != 9) { + std::cout << "This example requires NVIDIA's Hopper Architecture GPU with compute capability 90a\n" << std::endl; + return 0; + } + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + + int m = 512; + if (argc >= 2) + 
sscanf(argv[1], "%d", &m); + + int n = 256; + if (argc >= 3) + sscanf(argv[2], "%d", &n); + + int k = 1024; + if (argc >= 4) + sscanf(argv[3], "%d", &k); + + char transA = 'N'; + if (argc >= 5) + sscanf(argv[4], "%c", &transA); + + char transB = 'T'; + if (argc >= 6) + sscanf(argv[5], "%c", &transB); + + using TA = cute::half_t; + using TB = cute::half_t; + using TC = cute::half_t; + using TI = cute::half_t; + + TI alpha = TI(1.0f); + TI beta = TI(0.0f); + + thrust::host_vector h_A(m*k); + thrust::host_vector h_B(n*k); + thrust::host_vector h_C(m*n); + + // Initialize the tensors + for (int j = 0; j < m*k; ++j) h_A[j] = TA(int((rand() % 2) ? 1 : -1)); + for (int j = 0; j < n*k; ++j) h_B[j] = TB(int((rand() % 2) ? 1 : -1)); + for (int j = 0; j < m*n; ++j) h_C[j] = TC(0); + + thrust::device_vector d_A = h_A; + thrust::device_vector d_B = h_B; + thrust::device_vector d_C = h_C; + + double gflops = (2.0*m*n*k) * 1e-9; + + const int timing_iterations = 100; + GPU_Clock timer; + + int ldA = 0, ldB = 0, ldC = m; + + if (transA == 'N') { + ldA = m; + } else if (transA == 'T') { + ldA = k; + } else { + assert(false); + } + + if (transB == 'N') { + ldB = k; + } else if (transB == 'T') { + ldB = n; + } else { + assert(false); + } + + // Run once + d_C = h_C; + gemm(transA, transB, m, n, k, + alpha, + d_A.data().get(), ldA, + d_B.data().get(), ldB, + beta, + d_C.data().get(), ldC); + CUTE_CHECK_LAST(); + thrust::host_vector cute_result = d_C; + + // Timing iterations + timer.start(); + for (int i = 0; i < timing_iterations; ++i) { + gemm(transA, transB, m, n, k, + alpha, + d_A.data().get(), ldA, + d_B.data().get(), ldB, + beta, + d_C.data().get(), ldC); + } + double cute_time = timer.seconds() / timing_iterations; + CUTE_CHECK_LAST(); + printf("CUTE_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / cute_time, cute_time*1000); + +#else + + std::cout << "CUTLASS_ARCH_MMA_SM90_SUPPORTED must be enabled, but it is not. Test is waived \n" << std::endl; +#endif + + return 0; + +} diff --git a/examples/python/00_basic_gemm.ipynb b/examples/python/00_basic_gemm.ipynb new file mode 100644 index 0000000000..c27955517e --- /dev/null +++ b/examples/python/00_basic_gemm.ipynb @@ -0,0 +1,475 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "1ef96b3f", + "metadata": {}, + "source": [ + "# Basic example of using the CUTLASS Python interface\n", + "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs.\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/blob/main/examples/python/00_basic_gemm.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "id": "df94d7e6", + "metadata": {}, + "source": [ + "## Prerequisites for running on Colab\n", + "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71c7a069", + "metadata": {}, + "outputs": [], + "source": [ + "!#nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "cf16785d", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the CUTLASS Python interface. 
To do so, uncomment the following line and run the cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c819bb68", + "metadata": {}, + "outputs": [], + "source": [ + "!#pip install nvidia-cutlass" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "962324fd", + "metadata": {}, + "source": [ + "## General setup\n", + "We first import various packages needed for the example and construct the input and output tensors that will be used in our example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e324219", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import random\n", + "\n", + "import cutlass\n", + "\n", + "# This controls whether the C++ GEMM declaration will be printed at each step. \n", + "# Set to `False` to omit this information.\n", + "print_module = True\n", + "\n", + "m = 128\n", + "n = m\n", + "k = m\n", + "\n", + "dtype = np.float16\n", + "type_A = np.float16\n", + "type_B = np.float16\n", + "type_C = np.float16\n", + "type_D = np.float16\n", + "\n", + "np.random.seed(1234)\n", + "random.seed(1234)\n", + "scope_min = -4\n", + "scope_max = 4\n", + "tensor_A = np.ceil(np.random.uniform(low=scope_min, high=scope_max, size=(m, k)).astype(type_A))\n", + "tensor_B = np.ceil(np.random.uniform(low=scope_min, high=scope_max, size=(k, n)).astype(type_B))\n", + "tensor_C = np.ceil(np.random.uniform(low=scope_min, high=scope_max, size=(m, n)).astype(type_C))\n", + "\n", + "alpha = np.float16(1.)\n", + "beta = np.float16(0.)\n", + "\n", + "tensor_D = np.zeros(tensor_C.shape).astype(type_D)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f2c7bf48", + "metadata": {}, + "source": [ + "## Declaring and running a GEMM\n", + "To get started, one only needs to provide the tensors declared above to the `cutlass.op.Gemm` call.\n", + "This sets up a default GEMM operation for the given device on which you are running.\n", + "\n", + "Assuming that we are running on SM80, this default to using a GEMM that leverages FP16 Tensor Core operations.\n", + "\n", + "Calling `plan.run()` will generate the CUTLASS C++ kernel in question, compile it, and run it on the tensors we previously passed in. By setting `print_module` to `true`, the C++ code that is emitted is printed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dfd8975", + "metadata": {}, + "outputs": [], + "source": [ + "# We specify `element_accumulator` here so as to match the kernel run by NumPy below. However,\n", + "# specifying `element_accumulator` is not required if it is the same as `element`\n", + "plan = cutlass.Gemm(element=dtype, layout=cutlass.LayoutType.RowMajor, element_accumulator=np.float32)\n", + "plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4a5856de", + "metadata": {}, + "source": [ + "There are many other ways to construct a plan from `cutlass.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass.op.Gemm` constructor." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "945478ef", + "metadata": {}, + "source": [ + "We then compare the output to running the GEMM using NumPy." 
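As an aside on the constructor flexibility mentioned above, the sketch below illustrates what those alternative construction paths might look like. The per-operand keyword names (`element_A`, `layout_B`, ...) and the representative-tensor form (`A=...`, `B=...`) are assumptions drawn from the constructor documentation and from the FP8 example later in this notebook; the cell is illustrative only and is not needed for the comparison that follows.

```python
import numpy as np
import cutlass

# Reuses tensor_A/B/C/D from the setup cell above.

# Sketch 1 (assumed keywords): spell out each operand's type and layout explicitly.
plan_explicit = cutlass.op.Gemm(
    element_A=np.float16, element_B=np.float16,
    element_C=np.float16, element_D=np.float16,
    element_accumulator=np.float32,
    layout_A=cutlass.LayoutType.RowMajor,
    layout_B=cutlass.LayoutType.RowMajor,
    layout_C=cutlass.LayoutType.RowMajor)

# Sketch 2 (assumed keywords): let representative tensors imply the types and layouts.
plan_from_tensors = cutlass.op.Gemm(
    A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
    element_accumulator=np.float32)
```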
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b669de6", + "metadata": {}, + "outputs": [], + "source": [ + "tensor_D_numpy = (alpha * (tensor_A @ tensor_B)) + (beta * tensor_C)\n", + "np.testing.assert_array_equal(tensor_D, tensor_D_numpy)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ee5cbbbe", + "metadata": {}, + "source": [ + "Note that one could use the same kernel just declared for tensors provided by other frameworks beyond NumPy, such as PyTorch or CuPy." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b6c86493", + "metadata": {}, + "source": [ + "## Changing operation modes\n", + "By default, the CUTLASS Python interface will try to use Tensor Core operations whenever possible. If the configuration provided to `cutlass.op.Gemm` is not supported on Tensor Cores, the interface will fall back to using a SIMT kernel.\n", + "\n", + "The operation mode currently in use can be returned via the `plan.opclass` property. In this case Tensor Core operations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "529fda93", + "metadata": {}, + "outputs": [], + "source": [ + "print(plan.opclass)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6d27c575", + "metadata": {}, + "source": [ + "Suppose that we don't want to use Tensor Cores for this GEMM. One can change to using CUTLASS's SIMT GEMMs by setting the plan's `opclass` field.\n", + "\n", + "As is shown in the printed output, the emitted kernel uses template parameters that fit CUTLASS's SIMT GEMMs.\n", + "\n", + "Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a44d35b", + "metadata": {}, + "outputs": [], + "source": [ + "tensor_D_simt = np.zeros(tensor_C.shape).astype(type_D)\n", + "plan.opclass = cutlass.OpcodeClass.Simt\n", + "plan.run(tensor_A, tensor_B, tensor_C, tensor_D_simt, alpha, beta, print_module=print_module)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "639dcb59", + "metadata": {}, + "source": [ + "If we compare the output of the Tensor Core and SIMT GEMMs we just ran we see that they are equal." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b480853", + "metadata": {}, + "outputs": [], + "source": [ + "np.testing.assert_array_equal(tensor_D, tensor_D_simt)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0cce1eae", + "metadata": {}, + "source": [ + "## Running cached kernels\n", + "You may have noticed that the `plan.run()` calls for the previous two kernels took some time to execute. This is because the kernel being emitted had not yet been compiled.\n", + "\n", + "CUTLASS caches compiled binaries so that recompilation isn't necessary every time a kernel is run. For example, if we change modes back to using Tensor Cores and call `plan.run()` again (with a different set of tensor parameters), you'll find the call to return much faster." 
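To make the effect of this cache visible, one can time the Python-level latency of `plan.run()` itself. The helper below is a minimal sketch (not part of the original notebook); it measures end-to-end call time, which includes compilation on the first call and only launch plus synchronization once the kernel is cached.

```python
import time

def timed_run(plan, *tensors, **kwargs):
    # Wall-clock latency of a single plan.run() call, including any JIT compilation.
    start = time.time()
    plan.run(*tensors, **kwargs)
    return time.time() - start

# Hypothetical usage with the tensors defined above:
# first  = timed_run(plan, tensor_A, tensor_B, tensor_C, tensor_D)   # triggers compilation
# second = timed_run(plan, tensor_A, tensor_B, tensor_C, tensor_D)   # served from the cache
# print(f"first call: {first:.2f} s, second call: {second:.2f} s")
```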
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8051e5e", + "metadata": {}, + "outputs": [], + "source": [ + "m = 2400\n", + "n = 3232\n", + "k = 4096\n", + "\n", + "tensor_A = np.ceil(np.random.uniform(low=scope_min, high=scope_max, size=(m, k)).astype(type_A))\n", + "tensor_B = np.ceil(np.random.uniform(low=scope_min, high=scope_max, size=(k, n)).astype(type_B))\n", + "tensor_C = np.ceil(np.random.uniform(low=scope_min, high=scope_max, size=(m, n)).astype(type_C))\n", + "tensor_D = np.zeros(tensor_C.shape).astype(type_D)\n", + "\n", + "alpha = np.float16(1.)\n", + "beta = np.float16(2.)\n", + "\n", + "plan.opclass = cutlass.OpcodeClass.TensorOp\n", + "plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "52a4e318", + "metadata": {}, + "source": [ + "## Running non-default GEMMs\n", + "The previous examples showed how it is simple to get started running a default GEMM kernel in CUTLASS. But, what do you do if you want a bit more control over the parameters to the GEMM?\n", + "\n", + "Under the hood, CUTLASS enumerates the different GEMM configuration parameters possible for this kernel from the CUTLASS profiler. The code below shows how one can access the tile descriptions for the kernels (e.g., cluster, threadblock, and warp shape)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c593be1", + "metadata": {}, + "outputs": [], + "source": [ + "tiles = plan.tile_descriptions()\n", + "print('{} tile descriptions returned'.format(len(tiles)))\n", + "num_print = 10\n", + "print('First {} tile descriptions are:'.format(num_print))\n", + "for td in tiles[:num_print]:\n", + " print(td)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "dc3ad875", + "metadata": {}, + "source": [ + "Next, we'll pick one of these configurations at random and compile and run it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8dc5287", + "metadata": {}, + "outputs": [], + "source": [ + "tiles = [td for td in tiles if td.threadblock_shape[0] >= 128]\n", + "idx = random.randint(0, len(tiles)-1)\n", + "td = tiles[idx]\n", + "print('Tile description {} is: {}'.format(idx, td))\n", + "plan.compile(td)\n", + "plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c5a8b534", + "metadata": {}, + "source": [ + "One can also change the swizzling function used by the kernel. 
For example, one can modify the kernel to use the stream K feature of CUTLASS via:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5e88d17", + "metadata": {}, + "outputs": [], + "source": [ + "# Stream K is exposed through the threadblock swizzle method for pre-SM90 kernels,\n", + "# and via the tile_scheduler attribute of the TileDescription for post-SM90 kernels\n", + "if plan.cc < 90:\n", + " plan.swizzling_functor = cutlass.swizzle.ThreadblockSwizzleStreamK\n", + " plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)\n", + "else:\n", + " # Stream-K is currently only supported for warp-specialized cooperative kernels\n", + " td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative\n", + " td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative\n", + " td.tile_scheduler = cutlass.TileSchedulerType.StreamK\n", + "\n", + " plan.compile(td)\n", + " plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5a8ba2ba", + "metadata": {}, + "source": [ + "## Handling errors\n", + "The CUTLASS Python interface attempts to catch runtime and compilation errors in Python so as to provide more understandable error messages.\n", + "\n", + "Here's an example in which we try to use too many stages for a given GEMM kernel. Normally, this would result in a runtime error due to the GPU having insufficient shared memory to launch the kernel with 8 stages. The CUTLASS Python interface is able to detect this issue before compiling the kernel, and reports it back to the user. Uncomment and run the code below to see this error." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe7d0e42", + "metadata": {}, + "outputs": [], + "source": [ + "# td = tiles[0]\n", + "# td.stages = 8\n", + "# plan.compile(td)" + ] + }, + { + "cell_type": "markdown", + "id": "0fff34a4", + "metadata": {}, + "source": [ + "## Specializations for other data types\n", + "\n", + "Various CUTLASS kernels specialized for specific data types can also be run via the Python interface.\n", + "\n", + "For example, the code below shows how to declare and run a GEMM using the 3xTF32 feature (see corresponding C++ example [here](https://github.com/NVIDIA/cutlass/blob/main/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu))." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "338ad890", + "metadata": {}, + "outputs": [], + "source": [ + "from cutlass.backend.utils.device import device_cc\n", + "\n", + "# 3xTF32 requires SM80 or higher\n", + "if device_cc() >= 80:\n", + " plan = cutlass.op.Gemm(element=np.float32, layout=cutlass.LayoutType.RowMajor)\n", + " plan.math_operation = cutlass.MathOperation.multiply_add_fast_f32\n", + "\n", + " # Create input/output tensors in FP32\n", + " A, B = [np.ones((128, 128)).astype(np.float32) for _ in range(2)]\n", + " C, D = [np.zeros((128, 128)).astype(np.float32) for _ in range(2)]\n", + "\n", + " # Run the GEMM\n", + " plan.run(A, B, C, D, print_module=print_module)" + ] + }, + { + "cell_type": "markdown", + "id": "65531df1", + "metadata": {}, + "source": [ + "Additionally, one can run CUTLASS's FP8 GEMMs if using a frontend library capable of allocating and initializing FP8 tensors (e.g., PyTorch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "776f1d8d", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import torch\n", + "except ImportError:\n", + " print(\"PyTorch is not available. Skipping FP8 example\")\n", + " import sys; sys.exit(0)\n", + "\n", + "if not hasattr(torch, \"float8_e4m3fn\"):\n", + " print(\"Version of PyTorch does not have the float8_e4m3fn data type. Skipping FP8 example\")\n", + " import sys; sys.exit(0)\n", + "\n", + "# FP8 is supported through the CUTLASS Python interface on SM90 and higher\n", + "if device_cc() >= 90:\n", + " plan = cutlass.op.Gemm(element=torch.float8_e4m3fn, element_C=torch.float32, element_accumulator=torch.float32,\n", + " layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.ColumnMajor,\n", + " layout_C=cutlass.LayoutType.ColumnMajor)\n", + "\n", + " # Create input/output tensors in FP8\n", + " A, B = [torch.ones((128, 128)).to(torch.float8_e4m3fn).to(\"cuda\") for _ in range(2)]\n", + " C, D = [torch.zeros((128, 128)).to(torch.float8_e4m3fn).to(\"cuda\") for _ in range(2)]\n", + "\n", + " # Run the GEMM\n", + " plan.run(A, B, C, D, print_module=print_module)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + }, + "vscode": { + "interpreter": { + "hash": "0466d96796c9cd8f7a1cad264ff326ececc950ba2420e0256d5105fc1a3c6e70" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/python/01_epilogue.ipynb b/examples/python/01_epilogue.ipynb new file mode 100644 index 0000000000..97663f5035 --- /dev/null +++ b/examples/python/01_epilogue.ipynb @@ -0,0 +1,253 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "5d24a692", + "metadata": {}, + "source": [ + "# Example of using elementwise activation functions in the CUTLASS Python interface\n", + "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues.\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/blob/main/examples/python/01_epilogue.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "id": "28c916da", + "metadata": {}, + "source": [ + "## 
Prerequisites for running on Colab\n", + "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fcea8ea", + "metadata": {}, + "outputs": [], + "source": [ + "!#nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "7ec60b57", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1db9e51c", + "metadata": {}, + "outputs": [], + "source": [ + "!#pip install nvidia-cutlass" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "962324fd", + "metadata": {}, + "source": [ + "## General setup\n", + "We first import various packages needed for the example and construct the input and output tensors that will be used in our example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63a70a3c", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "import cutlass\n", + "\n", + "# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n", + "# omit this information.\n", + "print_module = True\n", + "\n", + "m = 256\n", + "n = m\n", + "k = m\n", + "\n", + "type_A = np.float16\n", + "type_B = np.float16\n", + "type_C = np.float16\n", + "type_D = np.float16\n", + "\n", + "np.random.seed(1234)\n", + "scope_min = -4\n", + "scope_max = 4\n", + "tensor_A = np.ceil(np.random.uniform(low=scope_min, high=scope_max, size=(m, k)).astype(type_A))\n", + "tensor_B = np.ceil(np.random.uniform(low=scope_min, high=scope_max, size=(k, n)).astype(type_B))\n", + "tensor_C = np.ceil(np.random.uniform(low=scope_min, high=scope_max, size=(m, n)).astype(type_C))\n", + "\n", + "alpha = np.float16(1.)\n", + "beta = np.float16(0.)\n", + "\n", + "tensor_D = np.zeros(tensor_C.shape).astype(type_D)" + ] + }, + { + "cell_type": "markdown", + "id": "1eb0d95b", + "metadata": {}, + "source": [ + "## Run a GEMM with an identity activation function\n", + "To begin, we simply run a default GEMM with an identity activation function. This performs the well-known operation `D = alpha * (A @ B) + beta * C`. This is the default activation function used, and does not need to be specified." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d257833", + "metadata": {}, + "outputs": [], + "source": [ + "plan = cutlass.op.Gemm(element=np.float16, layout=cutlass.LayoutType.RowMajor)\n", + "plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)" + ] + }, + { + "cell_type": "markdown", + "id": "54961694", + "metadata": {}, + "source": [ + "## Run a GEMM with a ReLU element-wise activation function\n", + "CUTLASS makes it easy to support other element-wise activation functions. This results in performing an element-wise after the generic linear combination performed in a GEMM. If we call such an activation function `act`, the resulting formulation is:\n", + "```\n", + "D = alpha * (A @ B) + beta * C\n", + "D = act(D)\n", + "```\n", + "\n", + "Here, we will add a ReLU activation function. Given an input `x`, ReLU returns `max(x, 0)`.\n", + "\n", + "This is easy to do in CUTLASS. One only needs to set the plan's `activation` field." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fe49443", + "metadata": {}, + "outputs": [], + "source": [ + "tensor_D_relu = np.zeros(tensor_C.shape).astype(type_D)\n", + "plan.activation = \"relu\"\n", + "plan.run(tensor_A, tensor_B, tensor_C, tensor_D_relu, print_module=print_module)" + ] + }, + { + "cell_type": "markdown", + "id": "455d0a37", + "metadata": {}, + "source": [ + "We can now verify that the result of the GEMM that used a ReLU activation function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e32e7798", + "metadata": {}, + "outputs": [], + "source": [ + "relu_ref = (tensor_D >= 0).astype(type_D) * tensor_D\n", + "np.testing.assert_array_equal(relu_ref, tensor_D_relu)" + ] + }, + { + "cell_type": "markdown", + "id": "cf959171", + "metadata": {}, + "source": [ + "## Other element-wise activation functions\n", + "CUTLASS supports a variety of widely-used element-wise activation functions. We can obtain a list of these functions via the `get_activations()` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e17d730", + "metadata": {}, + "outputs": [], + "source": [ + "activations = plan.activations()\n", + "for activation in activations:\n", + " print(activation)" + ] + }, + { + "cell_type": "markdown", + "id": "0e4599fa", + "metadata": {}, + "source": [ + "We can then run each of them:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c3598c9", + "metadata": {}, + "outputs": [], + "source": [ + "for activation in activations:\n", + " print('=============================================================================================')\n", + " print(f'Compiling and running activation {activation}')\n", + " print('=============================================================================================')\n", + " plan.activation = activation\n", + " plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)" + ] + }, + { + "cell_type": "markdown", + "id": "18828622", + "metadata": {}, + "source": [ + "To add an activation with parameter such as `leaky_relu`, a tuple should be provided containing the activation function name and the (or a list of) parameter." 
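For reference, leaky ReLU with negative slope `s` computes `x` for `x >= 0` and `s * x` otherwise. The NumPy sketch below is not part of the original notebook; it shows the unfused math performed by the fused kernel in the next cell, together with a hypothetical spot-check one could run afterwards.

```python
import numpy as np

def leaky_relu_reference(x, negative_slope):
    # Element-wise leaky ReLU: keep non-negative values, scale negative ones.
    return np.where(x >= 0, x, negative_slope * x).astype(x.dtype)

# Hypothetical check after the next cell has populated tensor_D:
# expected = leaky_relu_reference(alpha * (tensor_A @ tensor_B) + beta * tensor_C, negative_slope)
# np.testing.assert_array_equal(tensor_D, expected)
```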
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53108eae", + "metadata": {}, + "outputs": [], + "source": [ + "negative_slope = 0.5\n", + "plan.activation = (\"leaky_relu\", negative_slope)\n", + "plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/python/02_pytorch_extension_grouped_gemm.ipynb b/examples/python/02_pytorch_extension_grouped_gemm.ipynb new file mode 100644 index 0000000000..86c86fb65c --- /dev/null +++ b/examples/python/02_pytorch_extension_grouped_gemm.ipynb @@ -0,0 +1,300 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "6acbea5d", + "metadata": {}, + "source": [ + "# Exporting a CUTLASS grouped GEMM kernel to a PyTorch CUDA extension\n", + "This notebook walks through a basic example of using the CUTLASS Python interface to declare\n", + "a grouped GEMM kernel and export it as a PyTorch CUDA extension. Note that GEMM and Conv2d can also be exported as PyTorch CUDA extensions. \n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/blob/main/examples/python/02_pytorch_extension_grouped_gemm.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "id": "2d70560e", + "metadata": {}, + "source": [ + "## Prerequisites for running on Colab\n", + "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc7c7458", + "metadata": {}, + "outputs": [], + "source": [ + "!#nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "2107bb0d", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the CUTLASS Python interface and PyTorch. To do so, uncomment the following line and run the cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9852cb8", + "metadata": {}, + "outputs": [], + "source": [ + "!#pip install nvidia-cutlass torch --extra-index-url https://download.pytorch.org/whl/cu121" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "962324fd", + "metadata": {}, + "source": [ + "## Background on grouped GEMM\n", + "Grouped GEMM enables one to execute a set of GEMMs (each with potentially different sizes and strides)\n", + "in a single CUDA kernel. 
It can be thought of as a generalized version of a pointer-array GEMM,\n", + "without the requirement that the sizes and strides of each GEMM be the same.\n", + "\n", + "For example, if one has `p` GEMMs with sizes:\n", + "```text\n", + "M_1 x N_1 x K_1\n", + "M_2 x N_2 x K_2\n", + "...\n", + "M_p x N_p x K_p\n", + "```\n", + "CUTLASS's grouped GEMM will execute these in a single CUDA kernel.\n", + "\n", + "Grouped GEMM is particularly beneficial for saturating the GPU with many small problems that would\n", + "insufficiently utilize the device in isolation.\n", + "\n", + "## Declaring a grouped GEMM via the CUTLASS Python interface\n", + "A grouped GEMM operation is declared similarly to a GEMM operation in the CUTLASS Python interface: one\n", + "simply calls `cutlass.op.GroupedGemm`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdcf21d8", + "metadata": {}, + "outputs": [], + "source": [ + "import cutlass\n", + "import torch\n", + "\n", + "dtype = torch.float16\n", + "plan = cutlass.op.GroupedGemm(element=dtype, layout=cutlass.LayoutType.RowMajor)" + ] + }, + { + "cell_type": "markdown", + "id": "514f40a4", + "metadata": {}, + "source": [ + "We can then compile and run this operation on a group of GEMMs. We'll first set up some utility functions to initialize GEMMs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a7371e", + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "random.seed(2023)\n", + "\n", + "# Utility function to initialize A, B, C, and D matrices corresponding to dimensions M, N, and K\n", + "def initialize(dtype, M, N, K):\n", + " sizes = [(M, K), (K, N), (M, N), (M, N)]\n", + " return [torch.randint(-3, 3, size, device='cuda').to(dtype) for size in sizes]\n", + "\n", + "# Utility function to generate `problems` GEMMs of random sizes\n", + "def generate_problems(problems):\n", + " valid_sizes = [128, 256, 512, 1024]\n", + " As, Bs, Cs, Ds = [], [], [], []\n", + " for _ in range(problems):\n", + " M, N, K = [random.choice(valid_sizes) for _ in range(3)]\n", + " A, B, C, D = initialize(dtype, M, N, K)\n", + " As.append(A)\n", + " Bs.append(B)\n", + " Cs.append(C)\n", + " Ds.append(D)\n", + " return As, Bs, Cs, Ds" + ] + }, + { + "cell_type": "markdown", + "id": "590a3bc5", + "metadata": {}, + "source": [ + "We'll next run a group of 20 GEMMs via the CUTLASS Python interface and via PyTorch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "776c9233", + "metadata": {}, + "outputs": [], + "source": [ + "As, Bs, Cs, Ds, = generate_problems(20)\n", + "\n", + "plan.run(As, Bs, Cs, Ds, print_module=True)\n", + "Ds_torch = [a @ b for a, b in zip(As, Bs)]\n", + "\n", + "for d, d_torch in zip(Ds, Ds_torch):\n", + " assert torch.allclose(d, d_torch)" + ] + }, + { + "cell_type": "markdown", + "id": "766e4f03", + "metadata": {}, + "source": [ + "## Exporting the CUTLASS kernel to a PyTorch CUDA extension\n", + "The procedure above allows one to quickly experiment with using a CUTLASS kernels However, one might prefer to use the CUTLASS kernel via a [PyTorch CUDA extension](https://pytorch.org/tutorials/advanced/cpp_extension.html). This will avoids adding any runtime overheads associated with the Python portions of the CUTLASS Python interface.\n", + "\n", + "The CUTLASS Python interface provides simple solutions for creating PyTorch CUDA extensions for a CUTLASS kernel. 
These extensions can either be written out for a later \"ahead-of-time\" compilation, or be just-in-time compiled and returned to the user.\n", + "\n", + "To create a JIT-compiled module from the CUTLASS kernel we defined above, simply call the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a98dee6", + "metadata": {}, + "outputs": [], + "source": [ + "op = plan.construct()\n", + "grouped_gemm = cutlass.emit.pytorch(op, name='grouped_gemm', cc=plan.cc, sourcedir='out', jit=True)" + ] + }, + { + "cell_type": "markdown", + "id": "c8ca3991", + "metadata": {}, + "source": [ + "The `cutlass.emit.pytorch` function emits:\n", + "* `out/grouped_gemm_kernel.cu`: This file contains the declaration of the CUTLASS kernel and a method to call it from PyTorch tensors\n", + "* `out/grouped_gemm.cpp`: This file contains a C++ wrapper around the aforementioned CUTLASS kernel\n", + "* `setup.py`: This file contains the `setuptools` script for building and installing the generated extension\n", + "\n", + "The extension can be build from within the `module_output` directory by running:\n", + "```bash\n", + "TORCH_CUDA_ARCH_LIST=\"8.0\" python setup.py install\n", + "```\n", + "Where `TORCH_ARCH_LIST` is set to the compute capability of the device on which the kernel will be run.\n", + "\n", + "See the PyTorch [\"Custom C++ and CUDA Extensions\"](https://pytorch.org/tutorials/advanced/cpp_extension.html) tutorial for more details on this.\n", + "\n", + "The PyTorch CUDA extension could be built for this module by running:\n", + "```bash\n", + "cd out\n", + "TORCH_CUDA_ARCH_LIST=\"8.0\" python setup.py\n", + "```\n", + "(assuming that one is building for SM80)\n", + "\n", + "One could then use the kernel in a later PyTorch module by running:\n", + "\n", + "```python\n", + "import torch\n", + "import grouped_gemm\n", + "\n", + "grouped_gemm.run(As, Bs)\n", + "```\n", + "\n", + "In this case, however, we set `jit=True`, which specifies that we would like to compile and load the PyTorch CUDA extension on the fly.\n", + "Under the hood, this leverages the [torch.utils.cpp_extension.load](https://pytorch.org/tutorials/advanced/cpp_extension.html) method\n", + "and returns back the loaded extension.\n", + "\n", + "We can then use the extension and compare its results to running the GEMMs via vanilla PyTorch GEMMs:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cecb26a4", + "metadata": {}, + "outputs": [], + "source": [ + "Ds = grouped_gemm.run(As, Bs)\n", + "Ds_torch = [a @ b for a, b in zip(As, Bs)]\n", + "for d, d_torch in zip(Ds, Ds_torch):\n", + " assert torch.allclose(d, d_torch)" + ] + }, + { + "cell_type": "markdown", + "id": "50db80e4", + "metadata": {}, + "source": [ + "Finally, we can profile our grouped GEMM extension:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b76805d3", + "metadata": {}, + "outputs": [], + "source": [ + "num_warmup = 20\n", + "num_profile = 100\n", + "\n", + "# Warmup iterations\n", + "for _ in range(num_warmup):\n", + " Ds = grouped_gemm.run(As, Bs)\n", + " Ds_torch = [a @ b for a, b in zip(As, Bs)]\n", + " torch.cuda.synchronize()\n", + "\n", + "# Timing iterations\n", + "import time\n", + "grouped = 0\n", + "nongrouped = 0\n", + "for _ in range(num_profile):\n", + " start = time.time()\n", + " Ds = grouped_gemm.run(As, Bs)\n", + " torch.cuda.synchronize()\n", + " grouped += time.time() - start\n", + "\n", + " start = time.time()\n", + " Ds_torch = [a @ b for a, b in zip(As, Bs)]\n", + " 
torch.cuda.synchronize()\n", + " nongrouped += time.time() - start\n", + "\n", + "print('Grouped: {:.3f} us'.format(grouped * 1e6/num_profile))\n", + "print('Non-Grouped: {:.3f} us'.format(nongrouped * 1e6/num_profile))\n", + "print('Speedup: {:.3f}'.format(nongrouped / grouped))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/python/03_basic_conv2d.ipynb b/examples/python/03_basic_conv2d.ipynb new file mode 100644 index 0000000000..d0eb452675 --- /dev/null +++ b/examples/python/03_basic_conv2d.ipynb @@ -0,0 +1,465 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Basic example of using the CUTLASS Python interface for Conv2d\n", + "\n", + "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run Conv2d. \n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/blob/main/examples/python/03_basic_conv2d.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites for running on Colab\n", + "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!#nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!#pip install nvidia-cutlass" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## General setup\n", + "We first import various packages needed for the example and construct the input and output tensors that will be used in our example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import random\n", + "\n", + "import cutlass\n", + "\n", + "# This controls whether the C++ GEMM declaration will be printed at each step. 
\n", + "# Set to `false` to omit this information.\n", + "print_module = True\n", + "\n", + "# Input tensor: [N, H, W, C] under the channel-last layout\n", + "N, H, W, C = [32, 28, 28, 64]\n", + "\n", + "# Weight tensor: [K, R, S, C] under the channel-last layout\n", + "K, R, S = [128, 3, 3]\n", + "\n", + "# Stride, and padding\n", + "stride = (2, 2)\n", + "padding = (1, 1)\n", + "dilation = (1, 1)\n", + "\n", + "# Compute the output size [N, P, Q, K]\n", + "N, P, Q, K = cutlass.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)\n", + "\n", + "dtype = torch.float16\n", + "type_A = torch.float16\n", + "type_B = torch.float16\n", + "type_C = torch.float16\n", + "type_D = torch.float16\n", + "\n", + "torch.manual_seed(1234)\n", + "\n", + "input = torch.ceil(\n", + " torch.empty(size=(N, C, H, W), dtype=type_A, device=\"cuda\").uniform_(-4.5, 3.5)\n", + ").to(memory_format=torch.channels_last)\n", + "weight = torch.ceil(\n", + " torch.empty(size=(K, C, R, S), dtype=type_B, device=\"cuda\").uniform_(-4.5, 3.5)\n", + ").to(memory_format=torch.channels_last)\n", + "tensor_C = torch.ceil(\n", + " torch.empty(size=(N, K, P, Q), dtype=type_B, device=\"cuda\").uniform_(-4.5, 3.5)\n", + ").to(memory_format=torch.channels_last)\n", + "output = torch.zeros_like(tensor_C)\n", + "\n", + "alpha = 1.0\n", + "beta = 0.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Declaring and running a Conv2d Fprop\n", + "\n", + "We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n", + "\n", + "Assuming that we are runing on SM80, the default is a Conv2d that leverages FP16 Tensor Core operations.\n", + "\n", + "Calling `plan.run()` will generate the CUTLASS C++ kernel in question, compile it, and run it on the tensors we previously passed in. By setting `print_module` to `true`, the C++ code that is emitted is printed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Specifying `element_accumulator` is not required if it is the same as `element`\n", + "plan = cutlass.Conv2dFprop(element=dtype, element_accumulator=torch.float32)\n", + "plan.run(input, weight, tensor_C, output, stride, padding, dilation, alpha, beta, print_module=print_module)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are many other ways to construct a plan from `cutlass.op.Conv2dFprop` (e.g., by specifying the types of each operand, by providing representative tensors as input). For more details on these, see the documentation in the `cutlass.op.Conv2dFprop` constructor.\n", + "\n", + "We then compare the output to running the Conv2d using PyTorch. PyTorch use NCHW layout by default, so permutations are required." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_torch = alpha * torch.ops.aten.conv2d(\n", + " input, weight, stride=stride, padding=padding, dilation=dilation\n", + ") + beta * tensor_C\n", + "\n", + "assert torch.equal(output_torch, output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that one could use the same kernel just declared for tensors provided by other frameworks beyond PyTorch, such as NumPy." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Declaring and running Conv2d Dgrad and Wgrad\n", + "\n", + "The Python interface also supports declaring and running backward kernels of Conv2d. To begin with, we construct the tensors for the gradient of input, output, and weight." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grad_output = torch.ceil(\n", + " torch.empty(size=(N, K, P, Q), dtype=type_A, device=\"cuda\").uniform_(-4.5, 3.5)\n", + ").to(memory_format=torch.channels_last)\n", + "grad_input = torch.zeros_like(input)\n", + "grad_weight = torch.zeros_like(weight)\n", + "\n", + "tensor_C_dgrad = torch.ceil(\n", + " torch.empty(size=(N, C, H, W), dtype=type_A, device=\"cuda\").uniform_(-4.5, 3.5)\n", + ").to(memory_format=torch.channels_last)\n", + "tensor_C_wgrad = torch.ceil(\n", + " torch.empty(size=(K, C, R, S), dtype=type_B, device=\"cuda\").uniform_(-4.5, 3.5)\n", + ").to(memory_format=torch.channels_last)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The script below gives a simple example of computing a data gradient via the CUTLASS Python interface and via PyTorch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plan_dgrad = cutlass.Conv2dDgrad(element=dtype, element_accumulator=torch.float32)\n", + "plan_dgrad.run(grad_output, weight, tensor_C_dgrad, grad_input, stride, padding, dilation, alpha, beta, print_module=print_module)\n", + "\n", + "grad_input_torch = alpha * torch.nn.grad.conv2d_input(\n", + " (N, C, H, W),\n", + " weight, grad_output,\n", + " stride=stride, padding=padding\n", + ") + beta * tensor_C_dgrad\n", + "\n", + "assert torch.equal(grad_input_torch, grad_input)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The script below gives a simple example of computing a weight gradient via the CUTLASS Python interface and via PyTorch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plan_wgrad = cutlass.Conv2dWgrad(element=dtype, element_accumulator=torch.float32)\n", + "plan_wgrad.run(grad_output, input, tensor_C_wgrad, grad_weight, stride, padding, dilation, alpha, beta, print_module=print_module)\n", + "\n", + "grad_weight_torch = alpha * torch.nn.grad.conv2d_weight(\n", + " input, (K, C, R, S), grad_output,\n", + " stride=stride, padding=padding\n", + ") + beta * tensor_C_wgrad\n", + "\n", + "assert torch.equal(grad_weight_torch, grad_weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running non-default Conv2ds\n", + "\n", + "The previous examples showed how it is simple to get starting running a default Conv2d kernel in CUTLASS. But, what do you do if you want a bit more control over the parameters to the Conv2d? CUTLASS Python interface exposes mutable parameters that can be set after the `plan` initialization. 
We summarize these in the table below.\n", + "\n", + "|Parameter|Description|\n", + "| -- | -- |\n", + "|`tile_description`|The threadblock tile size, warp count, software pipeline stages, and instruction shape|\n", + "|`iterator_algorithm`|The iterator algorithm used to access the source operands|\n", + "|`swizzling_stride`|The stride of the threadblock swizzling functor|\n", + "|`split-K`|Partitions the reduction dimension to different threadblocks|" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tile Description\n", + "\n", + "The `tile_description` defines the tiling size of each threadblock, the warp count along each dimension of the tile, the software pipeline stages, and the instruction size. Under the hood, CUTLASS enumerates the different Conv2d configuration parameters for this kernel from the CUTLASS profiler. The code below shows how one can access the tile descriptions for the kernel (e.g., threadblock and warp shape)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plan.opclass = \"tensor_op\"\n", + "tiles = plan.tile_descriptions()\n", + "print(f'{len(tiles)} tile descriptions returned')\n", + "num_print = 10\n", + "print(f'First {num_print} tile descriptions are:')\n", + "for td in tiles[:num_print]:\n", + " print(td)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we'll pick one of these configurations at random and compile and run it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "random.seed(42)\n", + "idx = random.randint(0, len(tiles)-1)\n", + "td = tiles[idx]\n", + "print(f'Tile description {idx} is: {td}')\n", + "plan.tile_description = td\n", + "plan.run(input, weight, tensor_C, output, stride, padding, dilation, alpha, beta, print_module=print_module)\n", + "assert torch.equal(output_torch, output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Besides tile descriptions enumerated by CUTLASS, the users can also explicitly set the `threadblockshape`, `warp_shape`, `stages`, `instruction_shape`, and `cluster_shape`. If the configuration is invalid, an exception will be raised at `plan.run()` and the detailed compilation error will be stored in `./cutlass_python_compilation_error.txt` for debugging." 
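Because an invalid hand-written configuration only surfaces when `plan.run()` compiles the kernel, it can be convenient to wrap the call while experimenting. The helper below is a minimal sketch (not part of the original notebook); the exception type is not specified in this text, so it deliberately catches broadly and points at the compilation log mentioned above.

```python
def try_tile_description(plan, tile_description, *run_args, **run_kwargs):
    # Apply a candidate tile description and report whether it compiles and runs.
    plan.tile_description = tile_description
    try:
        plan.run(*run_args, **run_kwargs)
        return True
    except Exception as exc:  # broad on purpose; the exact exception type is not documented here
        print(f"Configuration rejected: {exc}")
        print("See ./cutlass_python_compilation_error.txt for the full compiler output.")
        return False
```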
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if plan.cc == 70:\n", + " plan.tile_description = {\n", + " \"threadblock_shape\": [64, 256, 32],\n", + " \"warp_count\": [1, 4, 1],\n", + " \"stages\": 2,\n", + " \"instruction_shape\": [8, 8, 4], # optional,\n", + " \"cluster_shape\": [1, 1, 1] # optional, only [1, 1, 1] is supported currently\n", + " }\n", + "elif plan.cc == 75:\n", + " plan.tile_description = {\n", + " \"threadblock_shape\": [128, 64, 32],\n", + " \"warp_count\": [2, 1, 1],\n", + " \"stages\": 2,\n", + " \"instruction_shape\": [16, 8, 8], # optional,\n", + " \"cluster_shape\": [1, 1, 1] # optional, only [1, 1, 1] is supported currently\n", + " }\n", + "elif plan.cc == 80:\n", + " plan.tile_description = {\n", + " \"threadblock_shape\": [128, 128, 64],\n", + " \"warp_count\": [2, 2, 1],\n", + " \"stages\": 4,\n", + " \"instruction_shape\": [16, 8, 16], # optional,\n", + " \"cluster_shape\": [1, 1, 1] # optional, only [1, 1, 1] is supported currently\n", + " }\n", + "elif plan.cc == 86:\n", + " plan.tile_description = {\n", + " \"threadblock_shape\": [128, 64, 64],\n", + " \"warp_count\": [2, 2, 1],\n", + " \"stages\": 3,\n", + " \"instruction_shape\": [16, 8, 16],\n", + " \"cluster_shape\": [1, 1, 1]\n", + " }\n", + "\n", + "plan.run(input, weight, tensor_C, output, stride, padding, dilation, alpha, beta, print_module=print_module)\n", + "assert torch.equal(output_torch, output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Iterator Algorithm\n", + "\n", + "The iterator algorithm describes how sources are loaded from memory. There are some iterator algorithms optimized for specific alignments and input/output channels that have better performance. The table below illustrates the available iterator algorithms.\n", + "\n", + "|Conv Kind | Iterator Algorithm | Description |\n", + "| -- | -- | -- |\n", + "|Fprop | \"analytic\" | Functionally correct in all cases but lower performance |\n", + "| | \"optimized\" | Optimized for and requires `R <= 32`, `S<= 32`, and `C % alignment_input == 0`|\n", + "| | \"few_channels\" | optimized for small `C` and requires `C % alignment_input == 0`|\n", + "| | \"fixed_channels\" | optimized for small `C` and requires `C == alignment_input` |\n", + "|Dgrad | \"analytic\" | Functionally correct in all cases but lower performance |\n", + "| | \"optimized\" | Optimzed for and require `R <= 32`, `S<= 32`, `K % alignment_grad_output == 0`, and `C % alignment_weight == 0`|\n", + "|Wgrad | \"analytic\" | Functionally correct in all cases but lower performance |\n", + "| | \"optimized\" | Optimized for and require `K % alignment_grad_output == 0`, and `C % alignment_input == 0`|\n", + "\n", + "By default, the Python interface will automatically propose a suitable iterator algorithm based on the input tensors in `plan.run()`. However, the user can also specify the desired iterator algorithm as follows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plan.iterator_algorithm = \"analytic\"\n", + "plan.run(input, weight, tensor_C, output, stride, padding, dilation, alpha, beta, print_module=print_module)\n", + "assert torch.equal(output_torch, output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the iterator algorithm is invalid for the problem size in `plan.run()`, an exception will be raised." 
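The selection rules in the table above can also be written down programmatically. The function below is purely illustrative: it is not the heuristic the interface actually uses, and the `small_channel_threshold` cutoff is an assumption standing in for the table's "small `C`"; it only restates the fprop constraints listed above.

```python
def propose_fprop_iterator_algorithm(C, R, S, alignment_input, small_channel_threshold=32):
    # Illustrative restatement of the fprop rows in the table above.
    if C == alignment_input:
        return "fixed_channels"      # optimized for very small C
    if C % alignment_input != 0:
        return "analytic"            # the only variant with no alignment requirement
    if C <= small_channel_threshold:
        return "few_channels"        # optimized for small C
    if R <= 32 and S <= 32:
        return "optimized"
    return "analytic"

# For the tensors above (C = 64, R = S = 3) with an assumed 8-element alignment:
# propose_fprop_iterator_algorithm(64, 3, 3, 8)  ->  "optimized"
```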
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Swizzling Stride\n", + "The swizzling changes how the tile are mapped to threadblocks to improve the L2 Locality. Given a swizzling stride `N`, the threadblock `(tb_x, tb_y)` computes tile `(tb_x / N, tb_y * N + (tb_x % N))`. Currently, stride values of `1`, `2`, `4`, and `8` are supported for `fprop`, `wgrad`, and `1`, and `4` for `dgrad`. The swizzling stride can be set with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plan.swizzling_stride = 4\n", + "plan.run(input, weight, tensor_C, output, stride, padding, dilation, alpha, beta, print_module=print_module)\n", + "assert torch.equal(output_torch, output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split-K\n", + "Split-K is usually applied when the Conv2d has small spatial dimensions and large reduction dimension to ensure good utilization. It further partitions the reduction dimension to different threadblocks. The CUTLASS Python interface supports two types of split-K strategies: `Parallel`, and `Serial`. \n", + "* `Parallel`: the partial results from different threadblocks are stored in a temporary buffer in the global memory. When the Conv2d is done, a separate reduction kernel is created and launched to reduce the partial results.\n", + "* `Serial`: A semaphore is used to coordinate the order of different threadblocks adding their partial results to a given output tile. A separate kernel does not need to be launched for prforming the reduction.\n", + "\n", + "While all `fprop`, `dgrad`, and `wgrad` support split-K, here we use `wgrad` as an example. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Parallel Split-K with 5 slices\n", + "grad_weight_parallel = torch.zeros_like(grad_weight)\n", + "plan_wgrad.run(\n", + " grad_output, input, tensor_C_wgrad, grad_weight_parallel, \n", + " stride, padding, dilation, alpha, beta, print_module=print_module, split_k=(\"parallel\", 5))\n", + "assert torch.equal(grad_weight_torch, grad_weight_parallel)\n", + "\n", + "# Serial Split-K with 3 slices\n", + "grad_weight_serial = torch.zeros_like(grad_weight)\n", + "plan_wgrad.run(\n", + " grad_output, input, tensor_C_wgrad, grad_weight_serial, \n", + " stride, padding, dilation, alpha, beta, print_module=print_module, split_k=(\"serial\", 3))\n", + "assert torch.equal(grad_weight_torch, grad_weight_serial)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/04_epilogue_visitor.ipynb b/examples/python/04_epilogue_visitor.ipynb new file mode 100644 index 0000000000..cf66cd2414 --- /dev/null +++ b/examples/python/04_epilogue_visitor.ipynb @@ -0,0 +1,258 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "5d24a692", + "metadata": {}, + "source": [ + "# Example of using epilogue visitor in the CUTLASS Python interface\n", + "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues 
through CUTLASS Epilogue Visitor.\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/blob/main/examples/python/04_epilogue_visitor.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "id": "3a800e79", + "metadata": {}, + "source": [ + "## Prerequisites for running on Colab\n", + "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cfff2c8", + "metadata": {}, + "outputs": [], + "source": [ + "!#nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "06706f00", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "491a7314", + "metadata": {}, + "outputs": [], + "source": [ + "!#pip install nvidia-cutlass" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "962324fd", + "metadata": {}, + "source": [ + "## General setup\n", + "We first import various packages needed for the example, construct the input and output tensors that will be used in our example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63a70a3c", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import cutlass\n", + "from cutlass.epilogue import relu\n", + "from cutlass import Tensor as FakeTensor\n", + "from cutlass.utils.profiler import CUDAEventProfiler\n", + "\n", + "# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n", + "# omit this information.\n", + "print_module = True\n", + "\n", + "# The Epilogue Visitor feature currently only works for SM80 and 90\n", + "from cutlass.backend.utils.device import device_cc\n", + "if device_cc() not in [80, 90]:\n", + " import sys\n", + " sys.exit()\n", + "\n", + "m = 16384\n", + "n = m\n", + "k = 512\n", + "\n", + "type_A = torch.float16\n", + "type_B = torch.float16\n", + "type_C = torch.float16\n", + "type_D = torch.float16\n", + "\n", + "torch.manual_seed(2023)\n", + "scope_min = -4\n", + "scope_max = 4\n", + "tensor_A = torch.ceil(torch.empty(size=(m, k), dtype=type_A, device=\"cuda\").uniform_(scope_min, scope_max))\n", + "tensor_B = torch.ceil(torch.empty(size=(k, n), dtype=type_B, device=\"cuda\").uniform_(scope_min, scope_max))\n", + "tensor_C = torch.ceil(torch.empty(size=(m, n), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n", + "tensor_D = torch.zeros_like(tensor_C)\n", + "\n", + "plan = cutlass.op.Gemm(element=torch.float16, layout=cutlass.LayoutType.RowMajor, element_accumulator=torch.float32)" + ] + }, + { + "cell_type": "markdown", + "id": "1eb0d95b", + "metadata": {}, + "source": [ + "## Define the epilogue visitor functor\n", + "The epilogue functor can be defined as a simple Python function and a set of example tensors for inputs and outputs. The example below illustrates a complex epilogue under the directed acyclic graph structure (`F` is used twice). The epilogue takes source tensors in different ranks: `alpha`, `beta` are scalars, `bias` is a column vector to broadcast, and `C`, `aux` are matrices. It contains various math operations from basic arithmatic operations and built-in callable functions like `relu`. It also accomodates multiple outputs `D` and `F`. 
Note that there are some restrictions on syntax.\n", + "* Each named variable must be assigned exactly once and defined before it used.\n", + "* Reserved names: `accum`, `C`, and `D` are reserved for accumulator, tensor_C, and tensor_D.\n", + "* Return values must be a named variable.\n", + "\n", + "The example tensors is a dictionary with tensor names as keys and reference tensors as values. The reference tensors can be `float`, `torch.Tensor`, `numpy.ndarray`, or our `FakeTensor`. They provides the shape and data type information of the inputs and outputs of the epilogue.\n", + "\n", + "The epilogue can be generated simply through `cutlass.evt.trace(, )`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d257833", + "metadata": {}, + "outputs": [], + "source": [ + "# Define epilogue visitor\n", + "def example_epilogue(accum, alpha, C, beta, aux, bias):\n", + " F = alpha * accum + (beta * C + aux)\n", + " E = relu(F + 1) + bias\n", + " D = E + F\n", + " return D, F\n", + "\n", + "# Construct inputs and outputs\n", + "alpha = 0.5\n", + "beta = 0.5\n", + "aux = torch.ceil(torch.empty(size=(m, n), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n", + "bias = torch.ceil(torch.empty(size=(m, 1), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n", + "tensor_F = torch.zeros_like(tensor_D)\n", + "examples_tensors = {\n", + " \"accum\": FakeTensor(element=torch.float32, shape=(m, n), layout_tag=cutlass.LayoutType.RowMajor),\n", + " \"alpha\": alpha,\n", + " \"C\": tensor_C,\n", + " \"beta\": beta,\n", + " \"aux\": aux,\n", + " \"bias\": bias,\n", + " \"D\": tensor_D,\n", + " \"F\": tensor_F\n", + "}\n", + "\n", + "# Trace the epilogue visitor\n", + "epilogue_visitor = cutlass.epilogue.trace(example_epilogue, examples_tensors)" + ] + }, + { + "cell_type": "markdown", + "id": "54961694", + "metadata": {}, + "source": [ + "## Run a GEMM with the epilogue visitor functor\n", + "The `epilogue_visitor` can be used by setting the plan's `epilogue_visitor` field. The arguments for the epilogue visitor are provided as a `dict` through the `visitor_args` keyword argument." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fe49443", + "metadata": {}, + "outputs": [], + "source": [ + "visitor_args = {\n", + " \"alpha\": alpha, \"C\": tensor_C, \"beta\": beta, \n", + " \"aux\": aux, \"bias\": bias, \"D\": tensor_D, \"F\": tensor_F\n", + "}\n", + "\n", + "plan.epilogue_visitor = epilogue_visitor\n", + "plan.run(\n", + " tensor_A, tensor_B, tensor_C, tensor_D, \n", + " visitor_args=visitor_args, print_module=print_module)" + ] + }, + { + "cell_type": "markdown", + "id": "455d0a37", + "metadata": {}, + "source": [ + "The epilogue function `example_epilogue` can be used as a reference function. 
We can now verify the results simply with" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e32e7798", + "metadata": {}, + "outputs": [], + "source": [ + "class TorchReference(torch.nn.Module):\n", + " def forward(self, A, B, alpha, C, beta, aux, bias):\n", + " accum = torch.matmul(A, B)\n", + " return example_epilogue(accum, alpha, C, beta, aux, bias)\n", + "\n", + "torch_reference = TorchReference()\n", + "tensor_D_ref, tensor_F_ref = torch_reference(tensor_A, tensor_B, alpha, tensor_C, beta, aux, bias)\n", + "\n", + "assert torch.equal(tensor_D, tensor_D_ref)\n", + "assert torch.equal(tensor_F, tensor_F_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "b69e441f", + "metadata": {}, + "source": [ + "The performance of CUTLASS fused kernel can be profiled with" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8db92150", + "metadata": {}, + "outputs": [], + "source": [ + "warmup_iterations = 10\n", + "profile_iterations = 50\n", + "# Profile CUTLASS fused kernel\n", + "duration = CUDAEventProfiler(\n", + " plan, warmup_iterations, profile_iterations,\n", + " tensor_A, tensor_B, tensor_C, tensor_D, \n", + " visitor_args=visitor_args)()\n", + "\n", + "print(f\"CUTLASS duration: {duration:.2f} ms\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/python/README.md b/examples/python/README.md new file mode 100644 index 0000000000..590f2e24e4 --- /dev/null +++ b/examples/python/README.md @@ -0,0 +1,22 @@ +# Examples of using the CUTLASS Python interface + +* [00_basic_gemm](/examples/python/00_basic_gemm.ipynb) + + Shows how declare, configure, compile, and run a CUTLASS GEMM using the Python interface + +* [01_epilogue](/examples/python/01_epilogue.ipynb) + + Shows how to fuse elementwise activation functions to GEMMs via the Python interface + +* [02_pytorch_extension_grouped_gemm](/examples/python/02_pytorch_extension_grouped_gemm.ipynb) + + Shows how to declare, compile, and run a grouped GEMM operation via the Python interface, + along with how the emitted kernel can be easily exported to a PyTorch CUDA extension. + +* [03_basic_conv2d](/examples/python/03_basic_conv2d.ipynb) + + Shows how to declare, configure, compile, and run a CUTLASS Conv2d using the Python interface + +* [04_epilogue_visitor](/examples/python/04_epilogue_visitor.ipynb) + + Shows how to fuse elementwise activation functions to GEMMs via the Python Epilogue Visitor interface diff --git a/include/cute/algorithm/axpby.hpp b/include/cute/algorithm/axpby.hpp new file mode 100644 index 0000000000..339743f491 --- /dev/null +++ b/include/cute/algorithm/axpby.hpp @@ -0,0 +1,95 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +// +// Accept mutable temporaries +// +template +CUTE_HOST_DEVICE +void +axpby(Alpha const& alpha, + Tensor const& x, + Beta const& beta, + Tensor && y, + PrdTensor const& p = {}) +{ + return axpby(alpha, x, beta, y, p); +} + +// +// AXPBY +// +template +CUTE_HOST_DEVICE +void +axpby(Alpha const& alpha, + Tensor const& x, + Beta const& beta, + Tensor & y, + PrdTensor const& p = {}) +{ + auto isBetaZero = [&] () { + if constexpr (is_complex::value) { + return beta.real() == Int<0>{} && beta.imag() == Int<0>{}; + } + else { + return beta == Int<0>{}; + } + + CUTE_GCC_UNREACHABLE; + } (); + + CUTE_UNROLL + for (int i = 0; i < size(x); ++i) { + if (p(i)) { + y(i) = (isBetaZero ? alpha * x(i) : alpha * x(i) + beta * y(i)); + } + } +} + +} // end namespace cute diff --git a/include/cute/algorithm/clear.hpp b/include/cute/algorithm/clear.hpp new file mode 100644 index 0000000000..0b3a8eaa1d --- /dev/null +++ b/include/cute/algorithm/clear.hpp @@ -0,0 +1,64 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::Tensor +#include // cute::fill + +namespace cute +{ + +// +// Accept mutable temporaries +// +template +CUTE_HOST_DEVICE +void +clear(Tensor&& tensor) +{ + return clear(tensor); +} + +// +// Set elements to zero +// +template +CUTE_HOST_DEVICE +void +clear(Tensor& tensor) +{ + using T = typename Tensor::value_type; + + fill(tensor, T{}); +} + +} // end namespace cute diff --git a/include/cute/algorithm/cooperative_copy.hpp b/include/cute/algorithm/cooperative_copy.hpp new file mode 100644 index 0000000000..c9e02245e2 --- /dev/null +++ b/include/cute/algorithm/cooperative_copy.hpp @@ -0,0 +1,339 @@ +/*************************************************************************************************** +* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +* SPDX-License-Identifier: BSD-3-Clause +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* +* 1. Redistributions of source code must retain the above copyright notice, this +* list of conditions and the following disclaimer. +* +* 2. Redistributions in binary form must reproduce the above copyright notice, +* this list of conditions and the following disclaimer in the documentation +* and/or other materials provided with the distribution. +* +* 3. Neither the name of the copyright holder nor the names of its +* contributors may be used to endorse or promote products derived from +* this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+* +**************************************************************************************************/ +#pragma once + +#include +#include +#include // cute::logical_divide +#include // cute::Swizzle +#include // cute::get_nonswizzle_portion +#include // cute::Tensor +#include +#include +#include + +namespace cute +{ + +template +CUTE_HOST_DEVICE void +naive_cooperative_copy(uint32_t const& tid, + Tensor const& src, + Tensor & dst) +{ + auto N = size(dst); + auto R = N % Int{}; + if (R > 0 && tid < R) { // Likely static condition && Residue in-bounds + dst[tid] = src[tid]; + } + CUTE_UNROLL + for (uint32_t i = uint32_t(R); i < uint32_t(N); i += NumThreads) { // All in-bounds + dst[tid + i] = src[tid + i]; + } +} + +// Accept mutable temporaries +template +CUTE_HOST_DEVICE void +naive_cooperative_copy(uint32_t const& tid, + Tensor const& src, + Tensor && dst) +{ + return naive_cooperative_copy(tid, src, dst); +} + +// A heuristic to determine a "good" permutation of two tensors for later vectorization and thr-assignment +template +CUTE_HOST_DEVICE constexpr +auto +heuristic_permutation(Tensor const& a, + Tensor const& b) +{ + constexpr bool swizzleA = get_swizzle_t::num_bits != 0 or + get_swizzle_t::num_bits != 0; + constexpr bool swizzleB = get_swizzle_t::num_bits != 0 or + get_swizzle_t::num_bits != 0; + auto a_inv = right_inverse(get_nonswizzle_portion(a.layout())); + auto b_inv = right_inverse(get_nonswizzle_portion(b.layout())); + + constexpr uint8_t scoreA = (uint8_t(swizzleA) << 2) | + (uint8_t(is_smem::value) << 1) | + (uint8_t(size(a_inv) > size(b_inv)) << 0); + + constexpr uint8_t scoreB = (uint8_t(swizzleB) << 2) | + (uint8_t(is_smem::value) << 1) | + (uint8_t(size(b_inv) > size(a_inv)) << 0); + + if constexpr (scoreA >= scoreB) { + return a_inv; + } else { + return b_inv; + } +} + +// cooperative_copy(thr_idx, src, dst) +// Use NumThreads to copy Tensor src to Tensor dst with element-wise vectorization up to MaxVecBits. +// @pre 0 <= @a tid < NumThreads +// @pre Tensors @a src and @a dst are aligned up to MaxVecBits. +// That is, pointers and dynamic strides are assumed to be aligned up to MaxVecBits. 
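To make the contract described in the comment above concrete, a block-wide staging copy might look like the following minimal sketch. This is not part of the header; the kernel, tile shape, and pointer names are illustrative, and the global pointer is assumed to be 16-byte aligned.

```cpp
#include <cute/tensor.hpp>                      // cute::make_tensor, cute::make_smem_ptr, ...
#include <cute/algorithm/cooperative_copy.hpp>  // cute::cooperative_copy

// Hypothetical kernel: 128 threads cooperatively stage a 128x8 float tile into shared memory.
__global__ void stage_tile(float const* gptr)   // gptr assumed 16-byte aligned
{
  using namespace cute;
  __shared__ alignas(16) float smem[128 * 8];

  // Statically shaped, compact (column-major) views of the same 128x8 tile.
  Tensor gA = make_tensor(make_gmem_ptr(gptr), make_shape(Int<128>{}, Int<8>{}));
  Tensor sA = make_tensor(make_smem_ptr(smem), make_shape(Int<128>{}, Int<8>{}));

  // NumThreads = 128, MaxVecBits = 128: each participating thread issues up to 128-bit accesses.
  cooperative_copy<128, 128>(threadIdx.x, gA, sA);
  __syncthreads();  // make the staged tile visible to the whole thread block
}
```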
+// +template +CUTE_HOST_DEVICE +void +cooperative_copy(uint32_t const& tid, + Tensor const& src, + Tensor & dst, + CopyPolicy const& cpy = {}) +{ + // Assumes the shapes are static, can generalize/fallback + CUTE_STATIC_ASSERT_V(is_static{} && is_static{}); + CUTE_STATIC_ASSERT_V(size(src) == size(dst)); + // Assumes the types are the same, can generalize/fallback + static_assert(cute::is_same::value); + static_assert(MaxVecBits == sizeof_bits_v || + MaxVecBits == 8 || MaxVecBits == 16 || MaxVecBits == 32 || MaxVecBits == 64 || MaxVecBits == 128, + "Expected MaxVecBits to be value size or 8 or 16 or 32 or 64 or 128 for alignment and performance."); + // Check that the tensors are likely shared across threads: either gmem or smem + static_assert((is_gmem::value || is_smem::value), + "cooperative_copy expects shared gmem or smem source tensor."); + static_assert((is_gmem::value || is_smem::value), + "cooperative_copy expects shared gmem or smem destination tensor."); + // Precondition on tid in DEBUG + assert(tid < NumThreads); + // Precondition on pointer alignment in DEBUG + assert(is_byte_aligned(raw_pointer_cast(src.data()))); + assert(is_byte_aligned(raw_pointer_cast(dst.data()))); + +#if 0 + if (thread0()) { + print(" "); print("cooperative_copy\n"); + print(" "); print("NumThreads: "); print(NumThreads); print("\n"); + print(" "); print("MaxVecBits: "); print(MaxVecBits); print("\n"); + print(" "); print("src: "); print(src); print("\n"); + print(" "); print("dst: "); print(dst); print("\n"); + } +#ifdef __CUDA_ARCH__ + __syncthreads(); +#endif +#endif + + // The common layout of the two tensors that can be vectorized over elements and threads + // vidx -> coord + auto common_layout = heuristic_permutation(src, dst); + + // Apply + // (V, rest) + Tensor src_a = coalesce(logical_divide(src, common_layout), Shape<_1,_1>{}); + Tensor dst_a = coalesce(logical_divide(dst, common_layout), Shape<_1,_1>{}); + + // + // Determine vectorization of elems and thrs based on src/dst size and number of threads + // NOTE: This heuristic promotes parallelization over vectorization + // + + // The number of elements and number of bits + constexpr int elem_bits = sizeof_bits_v; + constexpr int total_elem = size(SrcLayout{}); + + // The number of elements that can be vectorized in values + constexpr int common_elem = decltype(max_common_vector(src_a, dst_a))::value; + +#if 0 + if (thread0()) { + print(" "); print("common_layout: "); print(common_layout); print("\n"); + print(" "); print("src_a: "); print(src_a); print("\n"); + print(" "); print("dst_a: "); print(dst_a); print("\n"); + } +#ifdef __CUDA_ARCH__ + __syncthreads(); +#endif +#endif + + // + if constexpr (total_elem % NumThreads != 0) { + // Not attempting to find a partitioning pattern, fallback to dynamically indexed slowpath + + if constexpr (common_elem > 1 && MaxVecBits > elem_bits) { + // If the vectorization is non-trivial and divides the maximum vectorizations, then vectorize + constexpr auto max_align_src = elem_bits * decltype(max_alignment(src_a.layout()))::value; + constexpr auto max_align_dst = elem_bits * decltype(max_alignment(dst_a.layout()))::value; + constexpr auto vec_bits = gcd(max_align_src, max_align_dst, MaxVecBits); + using VecType = uint_bit_t; + + static_assert(vec_bits % elem_bits == 0, "Expected divisibility"); + static_assert((vec_bits >= 8), "No support for subbyte copying"); + + Tensor src_v = recast(src_a); + Tensor dst_v = recast(dst_a); + +#if 0 + if (thread0()) { + print(" "); print("cooperative_copy -- 
naive\n"); + print(" "); print("src_v: "); print(src_v); print("\n"); + print(" "); print("dst_v: "); print(dst_v); print("\n"); + } +#ifdef __CUDA_ARCH__ + __syncthreads(); +#endif +#endif + + naive_cooperative_copy(tid, src_v, dst_v); + } else { + naive_cooperative_copy(tid, src_a, dst_a); + } + } else { + // If the tensors can be equally partitioned by the threads, + // compute vectorization widths in elements and threads. + + // If there are too many threads to allow a full vectorized copy, trunc the vectorization + constexpr int total_bits = total_elem * elem_bits; + constexpr int max_bits_per_thr = total_bits / NumThreads; + // At least elem_bits, at most common_bits + constexpr int common_bits = common_elem * elem_bits; + constexpr int vec_bits = cute::max(elem_bits, cute::gcd(common_bits, int(MaxVecBits), max_bits_per_thr)); + + // Should account for vec_bits < 8 and/or vec_elem <= 1 + // And also account for subbyte types, which could cause race conditions + // Want to ENFORCE sufficient vectorization in those cases + static_assert(vec_bits % elem_bits == 0, "Expected divisibility"); + static_assert(vec_bits >= 8, "No support for subbyte copying"); + + using VecType = uint_bit_t; + constexpr int vec_elem = vec_bits / elem_bits; + + constexpr int vec_thrs = cute::min(int(NumThreads), total_elem / vec_elem); + + // + // Determine the partitioning patterns for the vec_elems and vec_thrs + // + + // Distribute the rest of the V*T to some consistent portion outside of the common_layout, if needed + auto common_domain_src = domain_distribute(shape(src_a), Int{}); + auto common_domain_dst = domain_distribute(shape(dst_a), Int{}); + + // Make sure for now, could fall back here instead + CUTE_STATIC_ASSERT_V(size(common_domain_src) == Int{}); + CUTE_STATIC_ASSERT_V(compatible(common_domain_src, common_domain_dst) || + compatible(common_domain_dst, common_domain_src)); + // Use the "more specific" domain for the extra elements of V*T + auto common_domain = conditional_return(compatible(common_domain_src, common_domain_dst), + common_domain_dst, common_domain_src); + + // Construct the tiler + auto tiler_vt = common_domain.with_shape(Int{}, Int{}); + + // Apply and slice + Tensor src_v = logical_divide(src_a, tiler_vt)(make_coord(_,tid),_); + Tensor dst_v = logical_divide(dst_a, tiler_vt)(make_coord(_,tid),_); + +#if 0 + if (thread0()) { + print(" "); print("cooperative_copy -- vec\n"); + print(" "); print("Used vector: "); print(vec_elem); print("\n"); + print(" "); print("Used threads: "); print(vec_thrs); print("\n"); + print(" "); print("tiler_vt: "); print(tiler_vt); print("\n"); + print(" "); print("src_v: "); print(src_v); print("\n"); + print(" "); print("dst_v: "); print(dst_v); print("\n"); + print(" "); print("recast(src_v): "); print(recast(src_v)); print("\n"); + print(" "); print("recast(dst_v): "); print(recast(dst_v)); print("\n"); + } +#ifdef __CUDA_ARCH__ + __syncthreads(); +#endif +#endif + + // If we're using all threads (static) or the tid is in-range (dynamic) + if (vec_thrs == NumThreads or tid < vec_thrs) { + auto src_c = recast(src_v); + auto dst_c = recast(dst_v); + return copy(cpy, src_c, dst_c); + } + } +} + + +// Default max-vectorization size to value_type size +template +CUTE_HOST_DEVICE +void +cooperative_copy(uint32_t const& tid, + Tensor const& src, + Tensor & dst, + CopyPolicy const& cpy = {}) +{ + constexpr uint32_t MaxVecBits = sizeof_bits_v; + return cooperative_copy(tid, src, dst, cpy); +} + +// +// Accept mutable temporaries +// + +template 
+CUTE_HOST_DEVICE +void +cooperative_copy(uint32_t const& tid, + Tensor const& src, + Tensor && dst, + CopyPolicy const& cpy = {}) +{ + return cooperative_copy(tid, src, dst, cpy); +} + +template +CUTE_HOST_DEVICE +void +cooperative_copy(uint32_t const& tid, + Tensor const& src, + Tensor && dst, + CopyPolicy const& cpy = {}) +{ + return cooperative_copy(tid, src, dst, cpy); +} + +} // end namespace cute diff --git a/include/cute/algorithm/cooperative_gemm.hpp b/include/cute/algorithm/cooperative_gemm.hpp new file mode 100644 index 0000000000..e4bd5ea628 --- /dev/null +++ b/include/cute/algorithm/cooperative_gemm.hpp @@ -0,0 +1,585 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include + +#include + +#include +#include +#include + +#include + +namespace cute +{ + +// +// Cooperative Shared-Memory GEMMs +// + +namespace detail { + +// Slow fallback path: +template +CUTE_HOST_DEVICE +void +epilogue_predication(ThrMMA const& thr_mma, + Alpha const& alpha, + Tensor & tCrC, + Beta const& beta, + Tensor & sC, + Tensor & tCsC, + CLoadTransformOp const& sC_load_op, // transforms C values before use in GEMM + CStoreTransformOp const& sC_store_op) // transforms results before they are stored to C +{ + using InputTypeC = typename TSC::value_type; + using ComputeTypeC = typename ThrMMA::ValTypeC; + CUTE_STATIC_ASSERT(CUTE_STL_NAMESPACE::is_same_v); + + // Create coordinate tensors for the problem + Tensor cC = make_identity_tensor(shape(sC)); // (M,N) -> (m,n) + // Repeat partitioning with thr_mma + Tensor tCcC = thr_mma.partition_C(cC); // (MMA,MMA_M,MMA_N) -> (m,n) + + const bool isBetaZero = [&] () { + if constexpr (is_complex::value) { + return beta.real() == Int<0>{} && beta.imag() == Int<0>{}; + } + else { + return beta == Int<0>{}; + } + CUTE_GCC_UNREACHABLE; + } (); + + // Custom axpby_if for now + CUTE_UNROLL + for (int i = 0; i < size(tCrC); ++i) + { + if (elem_less(tCcC(i), shape(sC))) + { + tCsC(i) = sC_store_op(isBetaZero ? alpha * tCrC(i) + : alpha * tCrC(i) + + beta * static_cast(sC_load_op(tCsC(i)))); + } + } +} + +template +CUTE_HOST_DEVICE +void +epilogue_no_predication(Alpha const& alpha, + Tensor & tCrC, + Beta const& beta, + Tensor & tCsC, + CLoadTransformOp const& sC_load_op, // transforms C values before use in GEMM + CStoreTransformOp const& sC_store_op, // transforms results before they are stored to C + SmemCopyOpC const& sC_copy_op) +{ + using InputTypeC = typename TSC::value_type; + using ComputeTypeC = typename TRC::value_type; + + const bool isBetaZero = [&] () { + if constexpr (is_complex::value) { + return beta.real() == Int<0>{} && beta.imag() == Int<0>{}; + } + else { + return beta == Int<0>{}; + } + CUTE_GCC_UNREACHABLE; + } (); + + Tensor tCrDi = make_fragment_like(tCsC); + Tensor tCrD = make_fragment_like(tCrC); + if(!isBetaZero) { + copy(sC_copy_op, tCsC, tCrDi); + // Transform C on/after load + cute::transform(tCrDi, tCrD, sC_load_op); + } + // C = alpha * (A * B) + beta * C + axpby(alpha, tCrC, beta, tCrD); + // Transform C before/on store + cute::transform(tCrD, tCrDi, sC_store_op); + copy(sC_copy_op, tCrDi, tCsC); +} + +// Predicated Cooperative GEMM +template +CUTE_HOST_DEVICE +void +cooperative_gemm_predication(ThrMMA const& thr_mma, + Tensor const& sA, + Tensor const& sB, + Tensor & tCrC, + ALoadTransformOp const& sA_load_op, // transforms A values before use in GEMM + BLoadTransformOp const& sB_load_op) // transforms B values before use in GEMM +{ + using InputTypeA = typename TA::value_type; + using InputTypeB = typename TB::value_type; + using InputTypeC = typename TC::value_type; + using ComputeTypeA = typename ThrMMA::ValTypeA; + using ComputeTypeB = typename ThrMMA::ValTypeB; + using ComputeTypeC = typename ThrMMA::ValTypeC; + + // + // MMA Partitioning + // + + // Partition the sA, sB, and sC tiles across the threads for the MMA + Tensor tCsA = thr_mma.partition_A(sA); // (MMA,MMA_M,MMA_K) + Tensor tCsB = thr_mma.partition_B(sB); // (MMA,MMA_N,MMA_K) + + // Create register tensors for the MMA to operate on + Tensor tCrA = thr_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K) + Tensor tCrB = 
thr_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K) + +#if 0 + if (thread0()) { + print(" sA: "); print( sA); print("\n"); + print(" sB: "); print( sB); print("\n"); + print(thr_mma); + print("tCsA: "); print(tCsA); print("\n"); + print("tCsB: "); print(tCsB); print("\n"); + print("tCrA: "); print(tCrA); print("\n"); + print("tCrB: "); print(tCrB); print("\n"); + print("tCrC: "); print(tCrC); print("\n"); + } +#endif + + // + // PREDICATION + // + + // Create coordinate tensors for the problem + Tensor cA = make_identity_tensor(shape(sA)); // (M,K) -> (m,k) + Tensor cB = make_identity_tensor(shape(sB)); // (N,K) -> (n,k) + + // Repeat partitioning with thr_mma + Tensor tCcA = thr_mma.partition_A(cA); // (MMA,MMA_M,MMA_K) -> (m,k) + Tensor tCcB = thr_mma.partition_B(cB); // (MMA,MMA_N,MMA_K) -> (n,k) + + // Allocate the preds for MMA- and MMA_MN-modes + Tensor tCpA = make_tensor(make_shape(size<0>(tCsA), size<1>(tCsA))); + Tensor tCpB = make_tensor(make_shape(size<0>(tCsB), size<1>(tCsB))); + + // Populate the predicates on M and N + CUTE_UNROLL + for (int i = 0; i < size(tCpA); ++i) { + tCpA(i) = elem_less(get<0>(tCcA(_,_,Int<0>{})(i)), shape<0>(sA)); + } + CUTE_UNROLL + for (int i = 0; i < size(tCpB); ++i) { + tCpB(i) = elem_less(get<0>(tCcB(_,_,Int<0>{})(i)), shape<0>(sB)); + } + +#if 0 + if (thread0()) { + print(" cA: "); print( cA); print("\n"); + print(" cB: "); print( cB); print("\n"); + print("tCcA: "); print(tCcA); print("\n"); + print("tCcB: "); print(tCcB); print("\n"); + print_tensor(tCpA); + print_tensor(tCpB); + } +#endif + + // + // PREFETCH k_block = 0 + // Condition the k-predication on (static) k_block == K_BLOCK_MAX-1, the last k_block + // Assumes the MMA-tiling in K is trivial + // + + constexpr int K_BLOCK_MAX = size<2>(tCrA); + + CUTE_UNROLL + for (int m = 0; m < size<1>(tCrA); ++m) { // Copy MMA_M + CUTE_UNROLL + for (int i = 0; i < size<0>(tCrA); ++i) { // Copy MMA_I + tCrA(i,m,0) = (tCpA(i,m) && (0 < K_BLOCK_MAX-1 || elem_less(get<1>(tCcA(i,m,0)), shape<1>(sA)))) ? static_cast(sA_load_op(tCsA(i,m,0))) : ComputeTypeA{}; + } + } + CUTE_UNROLL + for (int n = 0; n < size<1>(tCrB); ++n) { // Copy MMA_N + CUTE_UNROLL + for (int i = 0; i < size<0>(tCrB); ++i) { // Copy MMA_I + tCrB(i,n,0) = (tCpB(i,n) && (0 < K_BLOCK_MAX-1 || elem_less(get<1>(tCcB(i,n,0)), shape<1>(sB)))) ? static_cast(sB_load_op(tCsB(i,n,0))) : ComputeTypeB{}; + } + } + // + // MAINLOOP + // + + CUTE_UNROLL + for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) + { + if (k_block < K_BLOCK_MAX-1) // static-if not the last k_block + { + int k_next = k_block + 1; // Load k_next block + + // Condition the k-predication on (static) k_block == K_BLOCK_MAX-1, the last k_block + // Assumes the MMA-tiling in K is trivial + + CUTE_UNROLL + for (int m = 0; m < size<1>(tCrA); ++m) { // Copy MMA_M + CUTE_UNROLL + for (int i = 0; i < size<0>(tCrA); ++i) { // Copy MMA_I + tCrA(i,m,k_next) = (tCpA(i,m) && (k_next < K_BLOCK_MAX-1 || elem_less(get<1>(tCcA(i,m,k_next)), shape<1>(sA)))) ? static_cast(sA_load_op(tCsA(i,m,k_next))) : ComputeTypeA{}; + } + } + CUTE_UNROLL + for (int n = 0; n < size<1>(tCrB); ++n) { // Copy MMA_N + CUTE_UNROLL + for (int i = 0; i < size<0>(tCrB); ++i) { // Copy MMA_I + tCrB(i,n,k_next) = (tCpB(i,n) && (k_next < K_BLOCK_MAX-1 || elem_less(get<1>(tCcB(i,n,k_next)), shape<1>(sB)))) ? 
static_cast(sB_load_op(tCsB(i,n,k_next))) : ComputeTypeB{}; + } + } + } + // GEMM on k_block in registers + gemm(thr_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC); + } +} + +// Unpredicated Cooperative GEMM +template +CUTE_HOST_DEVICE +void +cooperative_gemm_no_predication(uint32_t thread_idx, + ThrMMA const& thr_mma, + Tensor const& sA, + Tensor const& sB, + Tensor & tCrC, + ALoadTransformOp const& sA_load_op, // transforms A values before use in GEMM + BLoadTransformOp const& sB_load_op, // transforms B values before use in GEMM + SmemCopyOpA const& sA_copy_op, + SmemCopyOpB const& sB_copy_op) +{ + using InputTypeA = typename TA::value_type; + using InputTypeB = typename TB::value_type; + using InputTypeC = typename TC::value_type; + using ComputeTypeA = typename ThrMMA::ValTypeA; + using ComputeTypeB = typename ThrMMA::ValTypeB; + using ComputeTypeC = typename ThrMMA::ValTypeC; + + + // + // MMA Partitioning + // + + // Create register tensors for the MMA to operate on + Tensor tCrA = thr_mma.partition_fragment_A(sA); // (MMA,MMA_M,MMA_K) + Tensor tCrB = thr_mma.partition_fragment_B(sB); // (MMA,MMA_N,MMA_K) + + using CopyOpAType = SmemCopyOpA; + using CopyOpBType = SmemCopyOpB; + + auto smem_tiled_copy_A = make_tiled_copy_A(Copy_Atom{}, thr_mma); + auto smem_thr_copy_A = smem_tiled_copy_A.get_thread_slice(thread_idx); + Tensor tCsA = smem_thr_copy_A.partition_S(sA); + Tensor tCrAi = make_fragment_like(tCsA); + Tensor tCrAi_copy_view = smem_thr_copy_A.retile_D(tCrAi); + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrAi_copy_view)); // CPY_M + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrAi_copy_view)); // CPY_K + + auto smem_tiled_copy_B = make_tiled_copy_B(Copy_Atom{}, thr_mma); + auto smem_thr_copy_B = smem_tiled_copy_B.get_thread_slice(thread_idx); + Tensor tCsB = smem_thr_copy_B.partition_S(sB); + Tensor tCrBi = make_fragment_like(tCsB); + Tensor tCrBi_copy_view = smem_thr_copy_B.retile_D(tCrBi); + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrBi_copy_view)); // CPY_N + CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCrBi_copy_view)); // CPY_K + +#if 0 + if (thread0()) { + print(" sA: "); print(sA); print("\n"); + print(" sB: "); print(sB); print("\n"); + print(thr_mma); print("\n"); + print("tCrA: "); print(tCrA); print("\n"); + print("tCrB: "); print(tCrB); print("\n"); + print("tCrC: "); print(tCrC); print("\n"); + print(smem_thr_copy_A); print("\n"); + print("tCsA: "); print(tCsA); print("\n"); + print("tCrA_copy_view: "); print(tCrA_copy_view); print("\n"); + print(smem_thr_copy_B); print("\n"); + print("tCsB: "); print(tCsB); print("\n"); + print("tCrB_copy_view: "); print(tCrB_copy_view); print("\n"); + } +#endif + + // + // PREFETCH + // + + copy(smem_tiled_copy_A, tCsA(_,_,Int<0>{}), tCrAi_copy_view(_,_,Int<0>{})); + copy(smem_tiled_copy_B, tCsB(_,_,Int<0>{}), tCrBi_copy_view(_,_,Int<0>{})); + // + // MAINLOOP + // + + constexpr int K_BLOCK_MAX = size<2>(tCrA); + + CUTE_UNROLL + for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) + { + // static-if load the next k_block. No k-predication required on these loads. 
+ if (k_block < K_BLOCK_MAX-1) + { + // Load the next k_block + int k_next = k_block + 1; // statically unrolled + copy(smem_tiled_copy_A, tCsA(_,_,k_next), tCrAi_copy_view(_,_,k_next)); + copy(smem_tiled_copy_B, tCsB(_,_,k_next), tCrBi_copy_view(_,_,k_next)); + } + + // Transform A and B, relying on the compiler to remove in case of identity ops + cute::transform(tCrAi(_,_,k_block), tCrA(_,_,k_block), sA_load_op); + cute::transform(tCrBi(_,_,k_block), tCrB(_,_,k_block), sB_load_op); + + // GEMM on k_block in registers + gemm(thr_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC); + } +} + +} // end namespace detail + +// C passed as a shared memory tensor +// Epilogue included +template +CUTE_HOST_DEVICE +void +cooperative_gemm(uint32_t thread_idx, + TiledMMA const& tiled_mma, + Alpha const& alpha, + Tensor const& sA, + Tensor const& sB, + Beta const& beta, + Tensor & sC, + ALoadTransformOp const& sA_load_op = {}, // transforms A values before use in GEMM + BLoadTransformOp const& sB_load_op = {}, // transforms B values before use in GEMM + CLoadTransformOp const& sC_load_op = {}, // transforms C values before use in GEMM + CStoreTransformOp const& sC_store_op = {}, // transforms results before they are stored to C + SmemCopyOpA const& sA_copy_op = {}, + SmemCopyOpB const& sB_copy_op = {}, + SmemCopyOpC const& sC_copy_op = {}) +{ + CUTE_STATIC_ASSERT_V(rank(sA) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(sB) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(sC) == Int<2>{}); + + CUTE_STATIC_ASSERT_V(size<0>(sA) == size<0>(sC)); // AM == CM + CUTE_STATIC_ASSERT_V(size<0>(sB) == size<1>(sC)); // BN == CN + CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB)); // AK == BK + + using InputTypeA = typename TA::value_type; + using InputTypeB = typename TB::value_type; + using InputTypeC = typename TC::value_type; + using ComputeTypeA = typename TiledMMA::ValTypeA; + using ComputeTypeB = typename TiledMMA::ValTypeB; + using ComputeTypeC = typename TiledMMA::ValTypeC; + + auto compat = evenly_divides(make_shape(size<0>(sA), size<0>(sB), size<1>(sA)), + tile_shape(TiledMMA{})); + + // ThrMMA + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCsC = thr_mma.partition_C(sC); // (MMA,MMA_M,MMA_N) :: InputTypeC + Tensor tCrC = thr_mma.make_fragment_C(tCsC); // (MMA,MMA_M,MMA_N) :: ComputeTypeC + + // Clear accumulators + clear(tCrC); + +#if 0 + if (thread0()) { + print(" sC: "); print(sC); print("\n"); + print(" tCsC: "); print(tCsC); print("\n"); + } +#endif + + if constexpr (is_constant::value) { + detail::cooperative_gemm_no_predication( + thread_idx, thr_mma, sA, sB, tCrC, sA_load_op, sB_load_op, sA_copy_op, sB_copy_op + ); + detail::epilogue_no_predication( + alpha, tCrC, beta, tCsC, sC_load_op, sC_store_op, sC_copy_op + ); + } else { + detail::cooperative_gemm_predication( + thr_mma, sA, sB, tCrC, sA_load_op, sB_load_op + ); + detail::epilogue_predication( + thr_mma, alpha, tCrC, beta, sC, tCsC, sC_load_op, sC_store_op + ); + } +} + +// C already partitioned into registers on input +// It can be passed non-empty +// Epilogue not included +template +CUTE_HOST_DEVICE +void +cooperative_gemm(uint32_t thread_idx, + TiledMMA const& tiled_mma, + Tensor const& sA, + Tensor const& sB, + Tensor & tCrC, + ALoadTransformOp const& sA_load_op = {}, // transforms A values before use in GEMM + BLoadTransformOp const& sB_load_op = {}, // transforms B values before use in GEMM + SmemCopyOpA const& sA_copy_op = {}, + SmemCopyOpB const& sB_copy_op = {}) +{ + CUTE_STATIC_ASSERT_V(rank(sA) == Int<2>{}); + 
CUTE_STATIC_ASSERT_V(rank(sB) == Int<2>{}); + + CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB)); // AK == BK + + using InputTypeA = typename TA::value_type; + using InputTypeB = typename TB::value_type; + using InputTypeC = typename TC::value_type; + using ComputeTypeA = typename TiledMMA::ValTypeA; + using ComputeTypeB = typename TiledMMA::ValTypeB; + using ComputeTypeC = typename TiledMMA::ValTypeC; + + // Check if input C fragment is compatible with thr_mma and problem size + using ref_c_frag = decltype(partition_shape_C(tiled_mma, make_shape(size<0>(sA), size<0>(sB)))); + CUTE_STATIC_ASSERT_V(compatible(shape(ref_c_frag{}), shape(tCrC))); + + auto compat = evenly_divides(make_shape(size<0>(sA), size<0>(sB), size<1>(sA)), + tile_shape(TiledMMA{})); + + // ThrMMA + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + + if constexpr (is_constant::value) { + detail::cooperative_gemm_no_predication( + thread_idx, thr_mma, sA, sB, tCrC, sA_load_op, sB_load_op, sA_copy_op, sB_copy_op + ); + } else { + detail::cooperative_gemm_predication( + thr_mma, sA, sB, tCrC, sA_load_op, sB_load_op + ); + } +} + +// Accept mutable temporaries +template +CUTE_HOST_DEVICE +void +cooperative_gemm(uint32_t thread_idx, + TiledMMA const& tiled_mma, + Alpha const& alpha, + Tensor const& sA, + Tensor const& sB, + Beta const& beta, + Tensor && sC, + ALoadTransformOp const& sA_load_op = {}, // transforms A values before use in GEMM + BLoadTransformOp const& sB_load_op = {}, // transforms B values before use in GEMM + CLoadTransformOp const& sC_load_op = {}, // transforms C values before use in GEMM + CStoreTransformOp const& sC_store_op = {}, // transforms results before they are stored to C + SmemCopyOpA const& sA_copy_op = {}, + SmemCopyOpB const& sB_copy_op = {}, + SmemCopyOpC const& sC_copy_op = {}) +{ + cooperative_gemm(thread_idx, tiled_mma, alpha, sA, sB, beta, sC, + sA_load_op, sB_load_op, sC_load_op, sC_store_op, + sA_copy_op, sB_copy_op, sC_copy_op); +} + +// Legacy overload of cute::gemm for backwards-compatibility +template +CUTE_HOST_DEVICE +void +gemm(ThrMMA const& thr_mma, + Alpha const& alpha, + Tensor const& sA, + Tensor const& sB, + Beta const& beta, + Tensor & sC, + ALoadTransformOp const& sA_load_op = {}, // transforms A values before use in GEMM + BLoadTransformOp const& sB_load_op = {}, // transforms B values before use in GEMM + CLoadTransformOp const& sC_load_op = {}, // transforms C values before use in GEMM + CStoreTransformOp const& sC_store_op = {}) // transforms results before they are stored to C +{ + CUTE_STATIC_ASSERT_V(rank(sA) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(sB) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(sC) == Int<2>{}); + + CUTE_STATIC_ASSERT_V(size<0>(sA) == size<0>(sC)); // AM == CM + CUTE_STATIC_ASSERT_V(size<0>(sB) == size<1>(sC)); // BN == CN + CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB)); // AK == BK + + Tensor tCsC = thr_mma.partition_C(sC); // (MMA,MMA_M,MMA_N) + Tensor tCrC = thr_mma.make_fragment_C(tCsC); // (MMA,MMA_M,MMA_N) + + // Goes directly to the slow path to avoid getting thread_idx from thr_mma + detail::cooperative_gemm_predication( + thr_mma, sA, sB, sC, sA_load_op, sB_load_op + ); + + detail::epilogue_predication( + thr_mma, alpha, tCrC, beta, sC, tCsC, sC_load_op, sC_store_op + ); +} + +} // end namespace cute diff --git a/include/cute/algorithm/copy.hpp b/include/cute/algorithm/copy.hpp new file mode 100644 index 0000000000..84ef49161d --- /dev/null +++ b/include/cute/algorithm/copy.hpp @@ -0,0 +1,545 @@ 
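Before moving on to `copy.hpp`, here is a rough usage sketch for the epilogue-included `cooperative_gemm` overload above. The tile sizes, thread layout, and FMA-based TiledMMA are illustrative choices, not taken from the header; a real kernel would pick an architecture-specific MMA atom and matching tiles.

```cpp
#include <cute/tensor.hpp>
#include <cute/atom/mma_atom.hpp>                // cute::make_tiled_mma, cute::UniversalFMA
#include <cute/algorithm/cooperative_gemm.hpp>   // cute::cooperative_gemm

// Hypothetical kernel: 128 threads compute sC = 1.0f * sA * sB + 0.0f * sC on
// shared-memory tiles sA:(M,K)=(64,16), sB:(N,K)=(64,16), sC:(M,N)=(64,64).
__global__ void block_gemm_tile()   // launch with exactly 128 threads per block
{
  using namespace cute;
  __shared__ alignas(16) float smemA[64 * 16];
  __shared__ alignas(16) float smemB[64 * 16];
  __shared__ alignas(16) float smemC[64 * 64];

  Tensor sA = make_tensor(make_smem_ptr(smemA), make_shape(Int<64>{}, Int<16>{}));  // (M,K)
  Tensor sB = make_tensor(make_smem_ptr(smemB), make_shape(Int<64>{}, Int<16>{}));  // (N,K)
  Tensor sC = make_tensor(make_smem_ptr(smemC), make_shape(Int<64>{}, Int<64>{}));  // (M,N)

  // ... tiles are assumed to have been staged into shared memory here ...

  // A plain FMA-based TiledMMA with a 16x8x1 arrangement of the 128 threads.
  auto tiled_mma = make_tiled_mma(UniversalFMA<float,float,float,float>{},
                                  Layout<Shape<_16,_8,_1>>{});

  // Epilogue-included overload: sC(m,n) = alpha * sum_k sA(m,k) * sB(n,k) + beta * sC(m,n).
  cooperative_gemm(threadIdx.x, tiled_mma, 1.0f, sA, sB, 0.0f, sC);
  __syncthreads();
}
```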
+/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::Tensor +#include // cute::TrivialPredTensor +#include // cute::Copy_Atom + +namespace cute +{ + +// +// copy_if -- Predicated Copy +// + +template +CUTE_HOST_DEVICE +void +copy_if(PrdTensor const& pred, + Tensor const& src, + Tensor & dst) +{ + using SrcType = typename SrcEngine::value_type; + using DstType = typename DstEngine::value_type; + + CUTE_UNROLL + for (int i = 0; i < size(dst); ++i) { + if (pred(i)) { + dst(i) = static_cast(static_cast(src(i))); + } + } +} + +// +// copy_if -- Predicated CopyAtom +// + +template +CUTE_HOST_DEVICE +void +copy_if(Copy_Atom const& copy_atom, + PredTensor const& pred, // (Rest...) + Tensor const& src, // (V,Rest...) + Tensor & dst) // (V,Rest...) 
+{ + static_assert(SrcLayout::rank == DstLayout::rank, "CopyAtom rank-mismatch."); + auto has_with_bool = cute::is_valid([](auto t)->void_t().with(true))>{}, copy_atom); + + if constexpr (SrcLayout::rank == 1) { // Dispatch the copy + if constexpr (has_with_bool) { + copy_atom.with(pred()).call(src, dst); + } else { + if (pred()) { copy_atom.call(src, dst); } + } + } else { // Loop over all but the first mode + constexpr int R = SrcLayout::rank; + Tensor src_v = group_modes<1,R>(src); + Tensor dst_v = group_modes<1,R>(dst); + CUTE_UNROLL + for (int i = 0; i < size<1>(dst_v); ++i) { + if constexpr (has_with_bool) { + copy_atom.with(pred(i)).call(src_v(_,i), dst_v(_,i)); + } else { + if (pred(i)) { copy_atom.call(src_v(_,i), dst_v(_,i)); } + } + } + } +} + +// +// copy_if -- AutoCopyAsync +// +template +CUTE_HOST_DEVICE +void +copy_if(AutoCopyAsync const& cpy, + PrdTensor const& pred, + Tensor const& src, + Tensor & dst) +{ + using SrcElemWithConst = remove_reference_t; + using SrcType = typename SrcEngine::value_type; + using DstType = typename DstEngine::value_type; + + auto copy_op = []() { +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + if constexpr (is_gmem::value && is_smem::value && + sizeof(SrcType) == sizeof(DstType)) { + if constexpr (is_const_v && sizeof(SrcType) == 16) { + return SM80_CP_ASYNC_CACHEGLOBAL{}; + } else if constexpr (sizeof(SrcType) == 4 || sizeof(SrcType) == 8 || sizeof(SrcType) == 16) { + return SM80_CP_ASYNC_CACHEALWAYS{}; + } else { + return UniversalCopy{}; + } + } else { + return UniversalCopy{}; + } + + CUTE_GCC_UNREACHABLE; +#else + return UniversalCopy{}; +#endif + }(); + + CUTE_UNROLL + for (int i = 0; i < size(dst); ++i) { + if (pred(i)) { + copy_op.copy(src(i), dst(i)); + } + } +} + +// +// copy -- AutoCopyAsync +// + +template +CUTE_HOST_DEVICE +void +copy(AutoCopyAsync const& cpy, + Tensor const& src, // (V,Rest...) + Tensor & dst) // (V,Rest...) +{ + copy_if(cpy, TrivialPredTensor{}, src, dst); +} + +// +// copy -- CopyAtom +// + +template +CUTE_HOST_DEVICE +void +copy(Copy_Atom const& copy_atom, + Tensor const& src, // (V,Rest...) + Tensor & dst) // (V,Rest...) 
+{ + static_assert(SrcLayout::rank == DstLayout::rank, "CopyAtom rank-mismatch."); + + if constexpr (SrcLayout::rank == 1) { // Dispatch the copy + copy_atom.call(src, dst); + } else { // Loop over all but the first mode + constexpr int R = SrcLayout::rank; + Tensor src_v = group_modes<1,R>(src); + Tensor dst_v = group_modes<1,R>(dst); + + if constexpr (is_static::value && is_static::value) { + CUTE_STATIC_ASSERT_V(size<1>(src_v) == size<1>(dst_v)); + + // AutoFilter on the Rest-mode + auto dst_null = nullspace(layout<1>(dst_v)); + + Tensor dst_n = zipped_divide(dst_v, make_tile(shape<0>(dst_v), dst_null)); // ((V, NLL), (_1, Rest)) + Tensor src_n = zipped_divide(src_v, make_tile(shape<0>(src_v), dst_null)); // ((V, NLL), (_1, Rest)) + + CUTE_STATIC_ASSERT_V(size<1>(src_n) == size<1>(dst_n)); + CUTE_STATIC_ASSERT_V((cosize<0,1>(dst_n.layout()) == Int<1>{}), "Nullspace definition error"); + CUTE_STATIC_ASSERT_V((cosize<0,1>(src_n.layout()) == Int<1>{}), "Error: Ambiguous scatter detected in copy"); + CUTE_STATIC_ASSERT_V((size<1,0>(dst_n) == Int<1>{})); + CUTE_STATIC_ASSERT_V((size<1,0>(src_n) == Int<1>{})); + + Tensor dst_c = dst_n(make_coord(_,Int<0>{}),make_coord(Int<0>{},_)); // (V, Rest) + Tensor src_c = src_n(make_coord(_,Int<0>{}),make_coord(Int<0>{},_)); // (V, Rest) + + CUTE_STATIC_ASSERT_V(size<1>(src_c) == size<1>(dst_c)); + CUTE_STATIC_ASSERT_V(shape<0>(dst_c) == shape<0>(dst)); + CUTE_STATIC_ASSERT_V(shape<0>(src_c) == shape<0>(src)); + + CUTE_UNROLL + for (int i = 0; i < size<1>(dst_c); ++i) { + copy_atom.call(src_c(_,i), dst_c(_,i)); + } + } else { + CUTE_UNROLL + for (int i = 0; i < size<1>(dst_v); ++i) { + copy_atom.call(src_v(_,i), dst_v(_,i)); + } + } + } +} + +//////////////////////////////////////////////////////// +// Special Auto-Vectorizing, Auto-Filtering Overloads // +//////////////////////////////////////////////////////// + +// Specialization for AutoVectorizingCopyAssumedAlignment +template +CUTE_HOST_DEVICE +void +copy(AutoVectorizingCopyWithAssumedAlignment const&, + Tensor const& src, + Tensor & dst) +{ + constexpr int common_elem = CUTE_STATIC_V(max_common_vector(src, dst)); + constexpr int align_bits = CUTE_STATIC_V(gcd(max_alignment(src), max_alignment(dst), Int{})); + static_assert(is_integral{} * sizeof_bits_v)>::value, "Error: Attempting a subbit copy!"); + constexpr int vec_bits = gcd(common_elem * sizeof_bits_v, align_bits); + + if constexpr (common_elem > 1 && ((vec_bits % 8) == 0)) { + // If more than one element vectorizes to 8bits or more, then recast and copy + using VecType = uint_bit_t; + // Preserve volatility + using SrcVecType = conditional_t, VecType const volatile, VecType const>; + using DstVecType = conditional_t, VecType volatile, VecType >; + + // Recast + Tensor src_v = recast(src); + Tensor dst_v = recast(dst); + +#if 0 + if (thread0()) { + print("copy -- found max_common_vector of %d elems and vectorization to %d bits\n", common_elem, vec_bits); + print(" "); print(src); print(" => "); print(src_v); print("\n"); + print(" "); print(dst); print(" => "); print(dst_v); print("\n"); + } +#endif + + return copy_if(TrivialPredTensor{}, src_v, dst_v); + } else { + return copy_if(TrivialPredTensor{}, src, dst); + } +} + +template +struct AutoFilter { + Base const& base; + CUTE_HOST_DEVICE AutoFilter(Base const& b) : base(b) {} +}; + +// Specialization for AutoFilter +template +CUTE_HOST_DEVICE +void +copy(AutoFilter const& copy_op, + Tensor const& src, + Tensor & dst) +{ + if constexpr (is_constant::value) { + auto dst_null = 
nullspace(dst.layout()); + + Tensor dst_n = zipped_divide(dst, dst_null); + Tensor src_n = zipped_divide(src, dst_null); + + CUTE_STATIC_ASSERT_V(cosize<0>(dst_n.layout()) == Int<1>{}, "Nullspace definition error"); + CUTE_STATIC_ASSERT_V(cosize<0>(src_n.layout()) == Int<1>{}, "Error: Ambiguous scatter detected in copy"); + + copy(copy_op.base, src_n(Int<0>{},_), dst_n(Int<0>{},_)); + } else { + copy(copy_op.base, src, dst); + } +} + +// Auto-vectorizing copy for static layouts +template +CUTE_HOST_DEVICE +void +copy(Tensor const& src, + Tensor & dst) +{ + if constexpr (is_static::value && is_static::value) { + // Assume Tensors with static layouts (e.g. registers) have pointers that are 128b aligned + return copy(AutoFilter(AutoVectorizingCopyWithAssumedAlignment<128>{}), src, dst); + } else + if constexpr (is_static::value && is_static::value) { + // Tensors with static shapes can be filtered, but do not assume that dynamic layouts are aligned. + return copy(AutoFilter(AutoVectorizingCopyWithAssumedAlignment<8>{}), src, dst); + } else { + // Do not assume that dynamic layouts are aligned. + return copy(AutoVectorizingCopyWithAssumedAlignment<8>{}, src, dst); + } +} + +// Auto-vectorizing copy with assumed alignment up to 128bit. +template +CUTE_HOST_DEVICE +void +copy_aligned(Tensor const& src, + Tensor & dst) +{ + if constexpr (is_static::value && is_static::value) { + // Tensors with static shapes can be filtered + return copy(AutoFilter(AutoVectorizingCopyWithAssumedAlignment<128>{}), src, dst); + } else { + return copy(AutoVectorizingCopyWithAssumedAlignment<128>{}, src, dst); + } +} + +// Specializaton for Atom AutoVectorizingCopyAssumedAlignment +template +CUTE_HOST_DEVICE +void +copy(Copy_Atom, Args...> const&, + Tensor const& src, + Tensor & dst) +{ + return copy(AutoVectorizingCopyWithAssumedAlignment{}, src, dst); +} + +#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED) +template +CUTE_HOST_DEVICE +void +copy(Copy_Traits const& atom, // Copy_Traits may or may not have the memory barrier in it already + Tensor const& src, + Tensor & dst) +{ + using SrcType = typename SrcEngine::value_type; + using DstType = typename DstEngine::value_type; + static_assert(cute::is_same::value); + static_assert((is_gmem::value && is_smem::value) || + (is_smem::value && is_gmem::value), + "Bulk Copy only supports gmem -> smem or smem -> gmem movement."); + // G2S or S2G dispatch + using BULK_COPY_OP = conditional_t::value, + SM90_BULK_COPY_G2S, + SM90_BULK_COPY_S2G>; + + // Find the common subtensor of src and dst + auto tiler = max_common_layout(src, dst); + constexpr int vec_elem = decltype(size(tiler))::value; + constexpr int vec_bits = vec_elem * sizeof_bits_v; + static_assert(vec_bits >= 128, "Expected at least 128-bits for BLKCP"); + + // Construct a new concrete Atom of the vector size + using BulkAtom = Copy_Atom, CT_Args...>, SrcType>; + auto bulk_atom = apply(atom.opargs_, [](auto const&... args) { return BulkAtom{args...}; }); + +#if 0 + if (thread0()) { + print("copy blkcp -- found a max_common_layout of "); print(tiler); print("\n"); + print(" "); print(src); print("\n"); + print(" "); print(dst); print("\n"); + } +#endif + + return copy(bulk_atom, logical_divide(src, tiler), logical_divide(dst, tiler)); +} + +// Backwards-compat. Throw out any extra Copy_Atom args. 
+template +CUTE_HOST_DEVICE +void +copy(Copy_Atom, CA_Args...> const& atom, + Tensor const& src, + Tensor & dst) +{ + return copy(static_cast const&>(atom), src, dst); +} +#endif // #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED) + +// +// Decay TiledCopy to CopyAtom +// + +template +CUTE_HOST_DEVICE +void +copy_if(TiledCopy const& tiled_copy, + PrdTensor const& pred, + Tensor const& src, + Tensor & dst) +{ + return copy_if(static_cast(tiled_copy), pred, src, dst); +} + +template +CUTE_HOST_DEVICE +void +copy(TiledCopy const& tiled_copy, + Tensor const& src, + Tensor & dst) +{ + return copy(static_cast(tiled_copy), src, dst); +} + +template +CUTE_HOST_DEVICE +void +copy_if(ThrCopy const& thr_copy, + PrdTensor const& pred, + Tensor const& src, + Tensor & dst) = delete; + +template +CUTE_HOST_DEVICE +void +copy(ThrCopy const& thr_copy, + Tensor const& src, + Tensor & dst) = delete; + +// +// Catch uncaught policies +// + +template +CUTE_HOST_DEVICE +void +copy_if(CopyPolicy const& cpy, + PredTensor const& prd, + Tensor const& src, + Tensor & dst) +{ + static_assert(dependent_false, "Unrecognized CopyPolicy."); +} + +template +CUTE_HOST_DEVICE +void +copy(CopyPolicy const& cpy, + Tensor const& src, + Tensor & dst) +{ + static_assert(dependent_false, "Unrecognized CopyPolicy."); +} + +// +// Accept mutable temporaries +// + +template +CUTE_HOST_DEVICE +void +copy_if(PrdTensor const& pred, + Tensor const& src, + Tensor && dst) +{ + return copy_if(pred, src, dst); +} + +template +CUTE_HOST_DEVICE +void +copy_if(CopyPolicy const& copy_policy, + PrdTensor const& pred, + Tensor const& src, + Tensor && dst) +{ + return copy_if(copy_policy, pred, src, dst); +} + +template +CUTE_HOST_DEVICE +void +copy(Tensor const& src, + Tensor && dst) +{ + return copy(src, dst); +} + +template +CUTE_HOST_DEVICE +void +copy(CopyPolicy const& copy_policy, + Tensor const& src, + Tensor && dst) +{ + return copy(copy_policy, src, dst); +} + +template +CUTE_HOST_DEVICE +void +copy_aligned(Tensor const& src, + Tensor && dst) +{ + return copy_aligned(src, dst); +} + +} // end namespace cute diff --git a/include/cute/algorithm/fill.hpp b/include/cute/algorithm/fill.hpp new file mode 100644 index 0000000000..3f33a42ade --- /dev/null +++ b/include/cute/algorithm/fill.hpp @@ -0,0 +1,87 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +// +// Accept mutable temporaries +// +template +CUTE_HOST_DEVICE +void +fill(Tensor&& tensor, T const& value) +{ + return fill(tensor, value); +} + +namespace detail +{ + +// Prefer fill(tensor.data(), value), if possible +template +CUTE_HOST_DEVICE +auto +fill(Tensor& tensor, T const& value, prefer<1>) + -> decltype(fill(tensor.data(), value)) +{ + fill(tensor.data(), value); +} + +// Default implementation +template +CUTE_HOST_DEVICE +void +fill(Tensor& tensor, T const& value, prefer<0>) +{ + CUTE_UNROLL + for (int i = 0; i < size(tensor); ++i) { + tensor(i) = value; + } +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE +void +fill(Tensor& tensor, T const& value) +{ + return detail::fill(tensor, value, prefer<1>{}); +} + +} // end namespace cute diff --git a/include/cute/algorithm/functional.hpp b/include/cute/algorithm/functional.hpp new file mode 100644 index 0000000000..ef80d018d7 --- /dev/null +++ b/include/cute/algorithm/functional.hpp @@ -0,0 +1,290 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::max, cute::min +#include // cute::conj + +/** C++14 extensions */ + +namespace cute { + +/**************/ +/** Identity **/ +/**************/ + +struct identity { + template + CUTE_HOST_DEVICE constexpr + decltype(auto) operator()(T&& arg) const { + return static_cast(arg); + } +}; + +template +struct constant_fn { + template + CUTE_HOST_DEVICE constexpr + decltype(auto) operator()(T&&...) const { + return r_; + } + R r_; +}; + +/***********/ +/** Unary **/ +/***********/ + +#define CUTE_LEFT_UNARY_OP(NAME,OP) \ + struct NAME { \ + template \ + CUTE_HOST_DEVICE constexpr \ + decltype(auto) operator()(T&& arg) const { \ + return OP static_cast(arg); \ + } \ + } +#define CUTE_RIGHT_UNARY_OP(NAME,OP) \ + struct NAME { \ + template \ + CUTE_HOST_DEVICE constexpr \ + decltype(auto) operator()(T&& arg) const { \ + return static_cast(arg) OP ; \ + } \ + } +#define CUTE_NAMED_UNARY_OP(NAME,OP) \ + struct NAME { \ + template \ + CUTE_HOST_DEVICE constexpr \ + decltype(auto) operator()(T&& arg) const { \ + return OP (static_cast(arg)); \ + } \ + } + +CUTE_LEFT_UNARY_OP(unary_plus, +); +CUTE_LEFT_UNARY_OP(negate, -); +CUTE_LEFT_UNARY_OP(bit_not, ~); +CUTE_LEFT_UNARY_OP(logical_not, !); +CUTE_LEFT_UNARY_OP(dereference, *); +CUTE_LEFT_UNARY_OP(address_of, &); +CUTE_LEFT_UNARY_OP(pre_increment, ++); +CUTE_LEFT_UNARY_OP(pre_decrement, --); + +CUTE_RIGHT_UNARY_OP(post_increment, ++); +CUTE_RIGHT_UNARY_OP(post_decrement, --); + +CUTE_NAMED_UNARY_OP(abs_fn, abs); +CUTE_NAMED_UNARY_OP(conjugate, cute::conj); + +#undef CUTE_LEFT_UNARY_OP +#undef CUTE_RIGHT_UNARY_OP +#undef CUTE_NAMED_UNARY_OP + +template +struct shift_right_const { + static constexpr int Shift = Shift_; + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) operator()(T&& arg) const { + return static_cast(arg) >> Shift; + } +}; + +template +struct shift_left_const { + static constexpr int Shift = Shift_; + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) operator()(T&& arg) const { + return static_cast(arg) << Shift; + } +}; + +/************/ +/** Binary **/ +/************/ + +#define CUTE_BINARY_OP(NAME,OP) \ + struct NAME { \ + template \ + CUTE_HOST_DEVICE constexpr \ + decltype(auto) operator()(T&& lhs, U&& rhs) const { \ + return static_cast(lhs) OP static_cast(rhs); \ + } \ + } +#define CUTE_NAMED_BINARY_OP(NAME,OP) \ + struct NAME { \ + template \ + CUTE_HOST_DEVICE constexpr \ + decltype(auto) operator()(T&& lhs, U&& rhs) const { \ + return OP (static_cast(lhs), static_cast(rhs)); \ + } \ + } + + +CUTE_BINARY_OP(plus, +); +CUTE_BINARY_OP(minus, -); +CUTE_BINARY_OP(multiplies, *); +CUTE_BINARY_OP(divides, /); +CUTE_BINARY_OP(modulus, %); + +CUTE_BINARY_OP(plus_assign, +=); +CUTE_BINARY_OP(minus_assign, -=); +CUTE_BINARY_OP(multiplies_assign, *=); +CUTE_BINARY_OP(divides_assign, /=); +CUTE_BINARY_OP(modulus_assign, %=); + +CUTE_BINARY_OP(bit_and, &); +CUTE_BINARY_OP(bit_or, |); +CUTE_BINARY_OP(bit_xor, ^); +CUTE_BINARY_OP(left_shift, <<); +CUTE_BINARY_OP(right_shift, >>); + +CUTE_BINARY_OP(bit_and_assign, &=); +CUTE_BINARY_OP(bit_or_assign, |=); +CUTE_BINARY_OP(bit_xor_assign, ^=); +CUTE_BINARY_OP(left_shift_assign, <<=); +CUTE_BINARY_OP(right_shift_assign, >>=); + +CUTE_BINARY_OP(logical_and, &&); +CUTE_BINARY_OP(logical_or, ||); + +CUTE_BINARY_OP(equal_to, ==); +CUTE_BINARY_OP(not_equal_to, !=); +CUTE_BINARY_OP(greater, >); +CUTE_BINARY_OP(less, <); 
+CUTE_BINARY_OP(greater_equal, >=); +CUTE_BINARY_OP(less_equal, <=); + +CUTE_NAMED_BINARY_OP(max_fn, cute::max); +CUTE_NAMED_BINARY_OP(min_fn, cute::min); + +#undef CUTE_BINARY_OP +#undef CUTE_NAMED_BINARY_OP + +/**********/ +/** Fold **/ +/**********/ + +#define CUTE_FOLD_OP(NAME,OP) \ + struct NAME##_unary_rfold { \ + template \ + CUTE_HOST_DEVICE constexpr \ + auto operator()(T&&... t) const { \ + return (t OP ...); \ + } \ + }; \ + struct NAME##_unary_lfold { \ + template \ + CUTE_HOST_DEVICE constexpr \ + auto operator()(T&&... t) const { \ + return (... OP t); \ + } \ + }; \ + struct NAME##_binary_rfold { \ + template \ + CUTE_HOST_DEVICE constexpr \ + auto operator()(U&& u, T&&... t) const { \ + return (t OP ... OP u); \ + } \ + }; \ + struct NAME##_binary_lfold { \ + template \ + CUTE_HOST_DEVICE constexpr \ + auto operator()(U&& u, T&&... t) const { \ + return (u OP ... OP t); \ + } \ + } + +CUTE_FOLD_OP(plus, +); +CUTE_FOLD_OP(minus, -); +CUTE_FOLD_OP(multiplies, *); +CUTE_FOLD_OP(divides, /); +CUTE_FOLD_OP(modulus, %); + +CUTE_FOLD_OP(plus_assign, +=); +CUTE_FOLD_OP(minus_assign, -=); +CUTE_FOLD_OP(multiplies_assign, *=); +CUTE_FOLD_OP(divides_assign, /=); +CUTE_FOLD_OP(modulus_assign, %=); + +CUTE_FOLD_OP(bit_and, &); +CUTE_FOLD_OP(bit_or, |); +CUTE_FOLD_OP(bit_xor, ^); +CUTE_FOLD_OP(left_shift, <<); +CUTE_FOLD_OP(right_shift, >>); + +CUTE_FOLD_OP(bit_and_assign, &=); +CUTE_FOLD_OP(bit_or_assign, |=); +CUTE_FOLD_OP(bit_xor_assign, ^=); +CUTE_FOLD_OP(left_shift_assign, <<=); +CUTE_FOLD_OP(right_shift_assign, >>=); + +CUTE_FOLD_OP(logical_and, &&); +CUTE_FOLD_OP(logical_or, ||); + +CUTE_FOLD_OP(equal_to, ==); +CUTE_FOLD_OP(not_equal_to, !=); +CUTE_FOLD_OP(greater, >); +CUTE_FOLD_OP(less, <); +CUTE_FOLD_OP(greater_equal, >=); +CUTE_FOLD_OP(less_equal, <=); + +#undef CUTE_FOLD_OP + +/**********/ +/** Meta **/ +/**********/ + +template +struct bound_fn { + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator()(T&& arg) { + return fn_(arg_, static_cast(arg)); + } + + Fn fn_; + Arg arg_; +}; + +template +CUTE_HOST_DEVICE constexpr +auto +bind(Fn const& fn, Arg const& arg) { + return bound_fn{fn, arg}; +} + +} // end namespace cute diff --git a/include/cute/algorithm/gemm.hpp b/include/cute/algorithm/gemm.hpp new file mode 100644 index 0000000000..c4713838b6 --- /dev/null +++ b/include/cute/algorithm/gemm.hpp @@ -0,0 +1,500 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +#include + +#include + +/** The gemm algorithm takes four (or three) tensors and computes + * D = A * B + C + * It dispatches based on the number of modes each tensor has: + * + * 1. `(V) x (V) => (V)`. + * The element-wise product of vectors. Dispatches to FMA or MMA. + * 2. `(M) x (N) => (M,N)`. + * The outer product of vectors. Dispatches to [3] with new mode K=(1). + * 3. `(M,K) x (N,K) => (M,N)`. + * The product of matrices. Dispatches to [5] with MMA vector-mode V. + * 4. `(V,M) x (V,N) => (V,M,N)`. + * The batched outer product of vectors. Accounts for register reuse and dispatches to [1] for each (m,n). + * 5. `(V,M,K) x (V,N,K) => (V,M,N)`. + * The batched product of matrices. Dispatches to [4] for each (k). + */ + +namespace cute +{ + +// +// Three arguments to four +// + +template +CUTE_HOST_DEVICE +void +gemm(Tensor const& A, + Tensor const& B, + Tensor & C) +{ + return gemm(C, A, B, C); +} + +template +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor const& A, + Tensor const& B, + Tensor & C) +{ + return gemm(mma, C, A, B, C); +} + +// +// Accept mutable temporaries +// + +template +CUTE_HOST_DEVICE +void +gemm(Tensor const& A, + Tensor const& B, + Tensor && C) +{ + return gemm(C, A, B, C); +} + +template +CUTE_HOST_DEVICE +void +gemm(Tensor && D, + Tensor const& A, + Tensor const& B, + Tensor const& C) +{ + return gemm(D, A, B, C); +} + +template +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor const& A, + Tensor const& B, + Tensor && C) +{ + return gemm(mma, C, A, B, C); +} + +template +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor && D, + Tensor const& A, + Tensor const& B, + Tensor const& C) +{ + return gemm(mma, D, A, B, C); +} + +// +// Default MMA is UniversalFMA +// + +template +CUTE_HOST_DEVICE +void +gemm(Tensor & D, + Tensor const& A, + Tensor const& B, + Tensor const& C) +{ + using MMA = MMA_Atom::value_type, + typename Tensor::value_type, + typename Tensor::value_type, + typename Tensor::value_type>>; + + return gemm(MMA{}, D, A, B, C); +} + +// +// Thread-Local Register-Memory GEMMs +// + +// Dispatch [1]: (V) x (V) => (V) +template ::value && + ALayout::rank == 1 && is_rmem::value && + BLayout::rank == 1 && is_rmem::value && + CLayout::rank == 1 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (V) Logical data + Tensor const& A, // (V) Logical data + Tensor const& B, // (V) Logical data + Tensor const& C) // (V) Logical data +{ + // No static assertions on (V), MMA checks compatibility + mma.call(D, A, B, C); +} + +// Dispatch [2]: (M) x (N) => (M,N) 
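+// The outer product of two vectors is lowered onto Dispatch [3] by appending a
+// unit K-mode to A and B. A minimal sketch of the equivalent scalar computation
+// (illustrative only; plain C-style arrays rather than the CuTe tensors used below):
+//
+//   for (int m = 0; m < M; ++m)
+//     for (int n = 0; n < N; ++n)
+//       D[m][n] = A[m] * B[n] + C[m][n];   // K-extent is 1
+//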
+template ::value && + ALayout::rank == 1 && is_rmem::value && + BLayout::rank == 1 && is_rmem::value && + CLayout::rank == 2 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (M,N) Logical data + Tensor const& A, // (M) Logical data + Tensor const& B, // (N) Logical data + Tensor const& C) // (M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D)); + gemm(mma, + D, // (M,N) + make_tensor(A.data(), append<2>(A.layout())), // (M,1) + make_tensor(B.data(), append<2>(B.layout())), // (N,1) + C); // (M,N) +} + +// Dispatch [3]: (M,K) x (N,K) => (M,N) +template ::value && + ALayout::rank == 2 && is_rmem::value && + BLayout::rank == 2 && is_rmem::value && + CLayout::rank == 2 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (M,N) Logical data + Tensor const& A, // (M,K) Logical data + Tensor const& B, // (N,K) Logical data + Tensor const& C) // (M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(B)); // AK == BK + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D)); + + // Assert this is a 1-value MMA + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutC_TV{}) == Int<1>{}); + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutA_TV{}) == Int<1>{}); + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutB_TV{}) == Int<1>{}); + + gemm(mma, + make_tensor(D.data(), prepend<3>(D.layout())), // (1,M,N) + make_tensor(A.data(), prepend<3>(A.layout())), // (1,M,K) + make_tensor(B.data(), prepend<3>(B.layout())), // (1,N,K) + make_tensor(C.data(), prepend<3>(C.layout()))); // (1,M,N) +} + +// Dispatch [4]: (V,M) x (V,N) => (V,M,N) +template ::value && + ALayout::rank == 2 && is_rmem::value && + BLayout::rank == 2 && is_rmem::value && + CLayout::rank == 3 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (V,M,N) Logical data + Tensor const& A, // (V,M) Logical data + Tensor const& B, // (V,N) Logical data + Tensor const& C) // (V,M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D)); + auto M = size<1>(A); + auto N = size<1>(B); + // REGISTER .reuse OPTIMIZATIONS + // 64-bit traversal specialization -- serpentine path + if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 8 && + decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 8) + { +#if 1 // NOTE: Row- vs Col- major could depend on the C-matrix order... (which we can test) + // Row-major serpentine iteration + CUTE_UNROLL + for (int m = 0; m < M; ++m) { + CUTE_UNROLL + for (int n = 0; n < N; ++n) { + int ns = (m & 1) ? N-1-n : n; // Serpentine coordinate + gemm(mma, D(_,m,ns), A(_,m), B(_,ns), C(_,m,ns)); + } + } +#else + // Col-major serpentine iteration + CUTE_UNROLL + for (int n = 0; n < N; ++n) { + CUTE_UNROLL + for (int m = 0; m < M; ++m) { + int ms = (n & 1) ? 
M-1-m : m; // Serpentine coordinate + gemm(mma, D(_,ms,n), A(_,ms), B(_,n), C(_,ms,n)); + } + } +#endif + } else + // 32-bit traversal specialization -- kinked serpentine path + if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 4 && + decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 4) + { +#if 1 // NOTE: Row- vs Col- major could depend on the C-matrix order... (which we can test) + // Row-major kinked serpentine iteration + CUTE_UNROLL + for (int m = 0; m < M; m += 2) { + CUTE_UNROLL + for (int n = 0; n < N; ++n) { + int ns = (m & 2) ? N-1-n : n; + gemm(mma, D(_,m+0,ns), A(_,m+0), B(_,ns), C(_,m+0,ns)); + + if (m+1 < M) { + gemm(mma, D(_,m+1,ns), A(_,m+1), B(_,ns), C(_,m+1,ns)); + } + } + } +#else + // Col-major kinked serpentine iteration + CUTE_UNROLL + for (int n = 0; n < N; n += 2) { + CUTE_UNROLL + for (int m = 0; m < M; ++m) { + // Kinked serpentine traversal for maximum register reuse + int ms = (n & 2) ? M-1-m : m; + gemm(mma, D(_,ms,n+0), A(_,ms), B(_,n+0), C(_,ms,n+0)); + + if (n+1 < N) { + gemm(mma, D(_,ms,n+1), A(_,ms), B(_,n+1), C(_,ms,n+1)); + } + } + } +#endif + } else + // 64-bit + 32-bit traversal order -- keep A (64-bit) in the outer loop and serpentine B + if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 8 && + decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 4) { + // Row-major serpentine iteration + CUTE_UNROLL + for (int m = 0; m < M; ++m) { + CUTE_UNROLL + for (int n = 0; n < N; ++n) { + int ns = (m & 1) ? N-1-n : n; // Serpentine coordinate + gemm(mma, D(_,m,ns), A(_,m), B(_,ns), C(_,m,ns)); + } + } + } else + // 32-bit + 64-bit traversal order -- keep B (64-bit) in the outer loop and serpentine A + if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 4 && + decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 8) { + // Col-major serpentine iteration + CUTE_UNROLL + for (int n = 0; n < N; ++n) { + CUTE_UNROLL + for (int m = 0; m < M; ++m) { + int ms = (n & 1) ? M-1-m : m; // Serpentine coordinate + gemm(mma, D(_,ms,n), A(_,ms), B(_,n), C(_,ms,n)); + } + } + } else + // Fallback to serpentine loop + { + // Col-major serpentine iteration + CUTE_UNROLL + for (int n = 0; n < N; ++n) { + CUTE_UNROLL + for (int m = 0; m < M; ++m) { + int ms = (n & 1) ? 
M-1-m : m; // Serpentine coordinate + gemm(mma, D(_,ms,n), A(_,ms), B(_,n), C(_,ms,n)); + } + } + } +} + +// Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N) +template ::value && + ALayout::rank == 3 && is_rmem::value && + BLayout::rank == 3 && is_rmem::value && + CLayout::rank == 3 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (V,M,N) Logical data + Tensor const& A, // (V,M,K) Logical data + Tensor const& B, // (V,N,K) Logical data + Tensor const& C) // (V,M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<2>(A) == size<2>(B)); // AK == BK + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D)); + auto K = size<2>(A); + + CUTE_UNROLL + for (int k = 0; k < K; ++k) { + gemm(mma, D, A(_,_,k), B(_,_,k), C); + } +} + +// +// Thread-Local Shared-Memory GEMMs +// + +// Dispatch [1]: (V) x (V) => (V) +// Dispatch [2]: (M) x (N) => (M,N) +// Dispatch [3]: (M,K) x (N,K) => (M,N) +// Dispatch [4]: (V,M) x (V,N) => (V,M,N) +// Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N) +// Dispatch [3]: (M,K) x (N,K) => (M,N) +template ::value && + ALayout::rank == 2 && is_smem::value && + BLayout::rank == 2 && is_smem::value && + CLayout::rank == 2 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (M,N) Logical data + Tensor const& A, // (M,K) Logical data + Tensor const& B, // (N,K) Logical data + Tensor const& C) // (M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(B)); // AK == BK + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D)); + + // Assert this is a 1-value MMA + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutC_TV{}) == Int<1>{}); + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutA_TV{}) == Int<1>{}); + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutB_TV{}) == Int<1>{}); + + gemm(mma, + make_tensor(D.data(), prepend<3>(D.layout())), // (1,M,N) + make_tensor(A.data(), prepend<3>(A.layout())), // (1,M,K) + make_tensor(B.data(), prepend<3>(B.layout())), // (1,N,K) + make_tensor(C.data(), prepend<3>(C.layout()))); // (1,M,N) +} + +// Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N) +template ::value && + ALayout::rank == 3 && is_smem::value && + BLayout::rank == 3 && is_smem::value && + CLayout::rank == 3 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (V,M,N) Logical data + Tensor const& A, // (V,M,K) Logical data + Tensor const& B, // (V,N,K) Logical data + Tensor const& C) // (V,M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<2>(A) == size<2>(B)); // AK == BK + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D)); + + auto rA = MMA_Atom::make_fragment_A(A); + auto rB = MMA_Atom::make_fragment_B(B); + + auto K = size<2>(A); + + CUTE_UNROLL + for (int k = 0; k < K; ++k) + { + copy(A(_,_,k), rA(_,_,k)); + copy(B(_,_,k), rB(_,_,k)); + // Thread-level register gemm for k + gemm(mma, D, rA(_,_,k), rB(_,_,k), C); + } +} + +} // end namespace cute diff --git a/include/cute/algorithm/prefer.hpp b/include/cute/algorithm/prefer.hpp new file mode 100644 index 0000000000..a69e504298 --- 
/dev/null +++ b/include/cute/algorithm/prefer.hpp @@ -0,0 +1,46 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +namespace cute +{ + +// Infinite types that inherit from each other +template +struct prefer : prefer {}; + +template <> +struct prefer<0> {}; + +// Can be used to preferencially overload implementations +// Higher N in prefer have higher priority. + +} // end namespace cute diff --git a/include/cute/algorithm/prefetch.hpp b/include/cute/algorithm/prefetch.hpp new file mode 100644 index 0000000000..c39f63acdd --- /dev/null +++ b/include/cute/algorithm/prefetch.hpp @@ -0,0 +1,145 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::Tensor +#include // cute::Copy_Atom + +namespace cute +{ + +// +// Prefetch global tensors into L2 +// + +template +CUTE_HOST_DEVICE +void +cooperative_prefetch(uint32_t const& tid, + Tensor const& src) +{ + static_assert(is_gmem::value, "Expected global tensor for prefetch"); + + constexpr int V = decltype(max_common_vector(src, src))::value; + + if constexpr (V > 1) { + // L2 sector is 32B, default fetch granularity is 64B + using VecType = conditional_t<(V * sizeof_bits_v) < (FetchBytes * 8), + ArrayEngine, + uint8_t[FetchBytes] >; + + Tensor src_v = recast(src); + CUTE_UNROLL + for (int i = tid; i < size(src_v); i += NumThreads) { + prefetch(raw_pointer_cast(&src_v(i))); + } + } else { + CUTE_UNROLL + for (int i = tid; i < size(src); i += NumThreads) { + prefetch(raw_pointer_cast(&src(i))); + } + } +} + +template +CUTE_HOST_DEVICE +void +prefetch(Tensor const& src) +{ + return cooperative_prefetch<1>(0, src); +} + +// Prefetch with copy atom +namespace detail { + +template +constexpr bool has_prefetch = false; + +template +constexpr bool has_prefetch> = true; + +} // end namespace detail + +template +CUTE_HOST_DEVICE +void +prefetch(Copy_Atom, CA_Args...> const& atom, + Tensor const& src) +{ + if constexpr (detail::has_prefetch) { + using Prefetch_Traits = Copy_Traits; + using Prefetch_Atom = Copy_Atom; + Prefetch_Atom prefetch_atom{atom}; + auto& dst = const_cast&>(src); // dst is ignored for prefetch atoms + return copy(prefetch_atom, src, dst); + } else { + return prefetch(src); + } +} + +#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED) +template +CUTE_HOST_DEVICE +void +prefetch(Copy_Traits const& atom, + Tensor const& src) +{ + using SrcType = typename SrcEngine::value_type; + static_assert(is_gmem::value, "Expected global tensor for L2 prefetch"); + + auto tiler = max_common_layout(src, src); + constexpr int vec_elem = decltype(size(tiler))::value; + constexpr int vec_bits = vec_elem * sizeof_bits_v; + static_assert(vec_bits >= 128, "Expected at least 128-bits for BLKCP"); + + // Construct a new concrete Atom of the vector size + auto bulk_atom = Copy_Atom>, SrcType>{}; + + return prefetch(bulk_atom, logical_divide(src, tiler)); +} + +// Backwards-compat. Throw out any extra Copy_Atom args. 
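+// (This overload strips the Copy_Atom wrapper via a cast to its Copy_Traits base
+// and forwards to the TMA prefetch overload above; the atom-only template
+// arguments carry no information that the prefetch path needs.)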
+template +CUTE_HOST_DEVICE +void +prefetch(Copy_Atom, CA_Args...> const& atom, + Tensor const& src) +{ + return prefetch(static_cast const&>(atom), src); +} +#endif // #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED) + +} // end namespace cute diff --git a/include/cute/algorithm/tensor_algorithms.hpp b/include/cute/algorithm/tensor_algorithms.hpp new file mode 100644 index 0000000000..dbffc61335 --- /dev/null +++ b/include/cute/algorithm/tensor_algorithms.hpp @@ -0,0 +1,166 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/** Common algorithms on (hierarchical) tensors */ + +#pragma once + +#include +#include + +namespace cute +{ + +// +// for_each +// + +template +CUTE_HOST_DEVICE constexpr +void +for_each(Tensor const& tensor, UnaryOp&& op) +{ + CUTE_UNROLL + for (int i = 0; i < size(tensor); ++i) { + op(tensor(i)); + } +} + +template +CUTE_HOST_DEVICE constexpr +void +for_each(Tensor& tensor, UnaryOp&& op) +{ + CUTE_UNROLL + for (int i = 0; i < size(tensor); ++i) { + op(tensor(i)); + } +} + +// Accept mutable temporaries +template +CUTE_HOST_DEVICE constexpr +void +for_each(Tensor&& tensor, UnaryOp&& op) +{ + return for_each(tensor, op); +} + +// +// transform +// + +// Similar to std::transform but does not return number of elements affected +template +CUTE_HOST_DEVICE constexpr +void +transform(Tensor& tensor, UnaryOp&& op) +{ + CUTE_UNROLL + for (int i = 0; i < size(tensor); ++i) { + tensor(i) = op(tensor(i)); + } +} + +// Accept mutable temporaries +template +CUTE_HOST_DEVICE constexpr +void +transform(Tensor&& tensor, UnaryOp&& op) +{ + return transform(tensor, op); +} + +// Similar to std::transform transforms one tensors and assigns it to another +template +CUTE_HOST_DEVICE constexpr +void +transform(Tensor const& tensor_in, + Tensor & tensor_out, + UnaryOp&& op) +{ + CUTE_UNROLL + for (int i = 0; i < size(tensor_in); ++i) { + tensor_out(i) = op(tensor_in(i)); + } +} + +// Accept mutable temporaries +template +CUTE_HOST_DEVICE constexpr +void +transform(Tensor const& tensor_in, + Tensor && tensor_out, + UnaryOp&& op) +{ + return transform(tensor_in, tensor_out, op); +} + +// Similar to std::transform with a binary operation +// Takes two tensors as input and one tensor as output. +// Applies the binary_op to tensor_in1 and tensor_in2 and +// assigns it to tensor_out +template +CUTE_HOST_DEVICE constexpr +void +transform(Tensor const& tensor_in1, + Tensor const& tensor_in2, + Tensor & tensor_out, + BinaryOp&& op) +{ + CUTE_UNROLL + for (int i = 0; i < size(tensor_in1); ++i) { + tensor_out(i) = op(tensor_in1(i), tensor_in2(i)); + } +} + +// Accept mutable temporaries +template +CUTE_HOST_DEVICE constexpr +void +transform(Tensor const& tensor_in1, + Tensor const& tensor_in2, + Tensor && tensor_out, + BinaryOp&& op) +{ + return transform(tensor_in1, tensor_in2, tensor_out, op); +} + +} // end namespace cute diff --git a/include/cute/algorithm/tuple_algorithms.hpp b/include/cute/algorithm/tuple_algorithms.hpp new file mode 100644 index 0000000000..5a70f590b6 --- /dev/null +++ b/include/cute/algorithm/tuple_algorithms.hpp @@ -0,0 +1,1073 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include +#include +#include + +/// @file tuple_algorithms.hpp +/// @brief Common algorithms on (hierarchical) tuples +/// +/// Code guidelines and style preferences: +/// +/// For perfect forwarding, don't use std::forward, because it may not +/// be defined in device code when compiling with NVRTC. Instead, use +/// `static_cast(parameter_name)`. +/// +/// CuTe generally does not bother forwarding functions, as +/// reference-qualified member functions are rare in this code base. +/// +/// Throughout CUTLASS, cute::make_tuple always needs to be called +/// namespace-qualified, EVEN If inside the cute namespace and/or in +/// scope of a "using namespace cute" declaration. Otherwise, the +/// compiler may select std::make_tuple instead of cute::make_tuple, +/// due to argument-dependent lookup. + +namespace cute +{ + +// +// Apply (Unpack) +// (t, f) => f(t_0,t_1,...,t_n) +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +apply(T&& t, F&& f, seq) +{ + return f(get(static_cast(t))...); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +apply(T&& t, F&& f) +{ + return detail::apply(static_cast(t), f, tuple_seq{}); +} + +// +// Transform Apply +// (t, f, g) => g(f(t_0),f(t_1),...) 
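+// e.g. an illustrative example with runtime integers:
+//   transform_apply(cute::make_tuple(2, 3),
+//                   [](auto x) { return x * x; },        // f: square each element
+//                   [](auto... v) { return (v + ...); }) // g: sum the results
+//   => 13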
+// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +tapply(T&& t, F&& f, G&& g, seq) +{ + return g(f(get(static_cast(t)))...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tapply(T0&& t0, T1&& t1, F&& f, G&& g, seq) +{ + return g(f(get(static_cast(t0)), + get(static_cast(t1)))...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tapply(T0&& t0, T1&& t1, T2&& t2, F&& f, G&& g, seq) +{ + return g(f(get(static_cast(t0)), + get(static_cast(t1)), + get(static_cast(t2)))...); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +transform_apply(T&& t, F&& f, G&& g) +{ + if constexpr (is_tuple>::value) { + return detail::tapply(static_cast(t), f, g, tuple_seq{}); + } else { + return g(f(static_cast(t))); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform_apply(T0&& t0, T1&& t1, F&& f, G&& g) +{ + if constexpr (is_tuple>::value) { + return detail::tapply(static_cast(t0), static_cast(t1), f, g, tuple_seq{}); + } else { + return g(f(static_cast(t0), static_cast(t1))); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform_apply(T0&& t0, T1&& t1, T2&& t2, F&& f, G&& g) +{ + if constexpr (is_tuple>::value) { + return detail::tapply(static_cast(t0), static_cast(t1), static_cast(t2), f, g, tuple_seq{}); + } else { + return g(f(static_cast(t0), static_cast(t1), static_cast(t2))); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// For Each +// (t, f) => f(t_0),f(t_1),...,f(t_n) +// + +template +CUTE_HOST_DEVICE constexpr +void +for_each(T&& t, F&& f) +{ + if constexpr (is_tuple>::value) { + return detail::apply(t, [&](auto&&... a) { (f(static_cast(a)), ...); }, tuple_seq{}); + } else { + return f(static_cast(t)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +for_each_leaf(T&& t, F&& f) +{ + if constexpr (is_tuple>::value) { + return detail::apply(static_cast(t), [&](auto&&... a){ return (for_each_leaf(static_cast(a), f), ...); }, tuple_seq{}); + } else { + return f(static_cast(t)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Transform +// (t, f) => (f(t_0),f(t_1),...,f(t_n)) +// + +template +CUTE_HOST_DEVICE constexpr +auto +transform(T const& t, F&& f) +{ + if constexpr (is_tuple::value) { + return detail::tapply(t, f, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_seq{}); + } else { + return f(t); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform(T0 const& t0, T1 const& t1, F&& f) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched tuple_size"); + return detail::tapply(t0, t1, f, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_seq{}); + } else { + return f(t0, t1); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform(T0 const& t0, T1 const& t1, T2 const& t2, F&& f) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched tuple_size"); + static_assert(tuple_size::value == tuple_size::value, "Mismatched tuple_size"); + return detail::tapply(t0, t1, t2, f, [](auto const&... 
a){ return cute::make_tuple(a...); }, tuple_seq{}); + } else { + return f(t0, t1, t2); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform_leaf(T const& t, F&& f) +{ + if constexpr (is_tuple::value) { + return transform(t, [&](auto const& a) { return transform_leaf(a, f); }); + } else { + return f(t); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform_leaf(T0 const& t0, T1 const& t1, F&& f) +{ + if constexpr (is_tuple::value) { + return transform(t0, t1, [&](auto const& a, auto const& b) { return transform_leaf(a, b, f); }); + } else { + return f(t0, t1); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// find and find_if +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +find_if(T const& t, F&& f, seq) +{ + if constexpr (decltype(f(get(t)))::value) { + return cute::C{}; + } else + if constexpr (sizeof...(Is) == 0) { + return cute::C{}; + } else { + return find_if(t, f, seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +find_if(T const& t, F&& f) +{ + if constexpr (is_tuple::value) { + return detail::find_if(t, f, tuple_seq{}); + } else { + return cute::C{}; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +find(T const& t, X const& x) +{ + return find_if(t, [&](auto const& v) { return v == x; }); // This should always return a static true/false +} + +template +CUTE_HOST_DEVICE constexpr +auto +any_of(T const& t, F&& f) +{ + if constexpr (is_tuple::value) { + return detail::apply(cute::transform(t, f), [&] (auto const&... a) { return (false_type{} || ... || a); }, tuple_seq{}); + } else { + return f(t); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +all_of(T const& t, F&& f) +{ + if constexpr (is_tuple::value) { + return detail::apply(cute::transform(t, f), [&] (auto const&... a) { return (true_type{} && ... && a); }, tuple_seq{}); + } else { + return f(t); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +none_of(T const& t, F&& f) +{ + return not any_of(t, f); +} + +// +// Filter +// (t, f) => +// + +template +CUTE_HOST_DEVICE constexpr +auto +filter_tuple(T const& t, F&& f) +{ + return transform_apply(t, f, [](auto const&... a) { return cute::tuple_cat(a...); }); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter_tuple(T0 const& t0, T1 const& t1, F&& f) +{ + return transform_apply(t0, t1, f, [](auto const&... a) { return cute::tuple_cat(a...); }); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter_tuple(T0 const& t0, T1 const& t1, T2 const& t2, F&& f) +{ + return transform_apply(t0, t1, t2, f, [](auto const&... a) { return cute::tuple_cat(a...); }); +} + +// +// Fold (Reduce, Accumulate) +// (t, v, f) => f(...f(f(v,t_0),t_1),...,t_n) +// + +namespace detail { + +template +struct FoldAdaptor { + template + CUTE_HOST_DEVICE constexpr auto operator|(X&& x) { + auto r = fn_(val_, static_cast(x)); + return FoldAdaptor{fn_, r}; + } + Fn fn_; + Val val_; +}; + +template +CUTE_HOST_DEVICE constexpr +auto +fold(T&& t, V const& v, F&& f, seq) +{ + return (FoldAdaptor{f,v} | ... 
| get(static_cast(t))).val_; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +fold(T&& t, V const& v, F&& f) +{ + if constexpr (is_tuple>::value) { + return detail::fold(static_cast(t), v, f, tuple_seq{}); + } else { + return f(v, static_cast(t)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +fold_first(T&& t, F&& f) +{ + if constexpr (is_tuple>::value) { + return detail::fold(static_cast(t), get<0>(t), f, make_range<1,tuple_size>::value>{}); + } else { + return t; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// front, back, take, select, unwrap +// + +// Get the first non-tuple element in a hierarchical tuple +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +front(T&& t) +{ + if constexpr (is_tuple>::value) { + return front(get<0>(static_cast(t))); + } else { + return static_cast(t); + } + + CUTE_GCC_UNREACHABLE; +} + +// Get the last non-tuple element in a hierarchical tuple +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +back(T&& t) +{ + if constexpr (is_tuple>::value) { + constexpr int N = tuple_size>::value; + + // MSVC needs a bit of extra help here deducing return types. + // We help it by peeling off the nonrecursive case a level "early." + if constexpr (! is_tuple(static_cast(t)))>>::value) { + return get(static_cast(t)); + } else { + return back(get(static_cast(t))); + } + } else { + return static_cast(t); + } + + CUTE_GCC_UNREACHABLE; +} + +// Takes the elements in the range [B,E) +template +CUTE_HOST_DEVICE constexpr +auto +take(T const& t) +{ + if constexpr (E == -1) { + if constexpr (is_tuple::value) { + return take::value>(t); + } else { + return take(t); + } + } else + if constexpr (B <= E) { + return detail::apply(t, [](auto const&... a) { return cute::make_tuple(a...); }, make_range{}); + } else { + static_assert(B <= E); + } + + CUTE_GCC_UNREACHABLE; +} + +// Select tuple elements with given indices. +template +CUTE_HOST_DEVICE constexpr +auto +select(T const& t) +{ + return cute::make_tuple(get(t)...); +} + +// Wrap non-tuples into rank-1 tuples or forward +template +CUTE_HOST_DEVICE constexpr +auto +wrap(T const& t) +{ + if constexpr (is_tuple::value) { + return t; + } else { + return cute::make_tuple(t); + } + + CUTE_GCC_UNREACHABLE; +} + +// Unwrap rank-1 tuples until we're left with a rank>1 tuple or a non-tuple +template +CUTE_HOST_DEVICE constexpr +auto +unwrap(T const& t) +{ + if constexpr (is_tuple::value) { + if constexpr (tuple_size::value == 1) { + return unwrap(get<0>(t)); + } else { + return t; + } + } else { + return t; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Flatten and Unflatten +// + +template +struct is_flat : true_type {}; + +template +struct is_flat> : bool_constant<(true && ... && (not is_tuple::value))> {}; + +// Flatten a hierarchical tuple to a tuple of depth one +// and wrap non-tuples into a rank-1 tuple. +template +CUTE_HOST_DEVICE constexpr +auto +flatten_to_tuple(T const& t) +{ + if constexpr (is_tuple::value) { + if constexpr (is_flat::value) { // Shortcut for perf + return t; + } else { + return filter_tuple(t, [](auto const& a) { return flatten_to_tuple(a); }); + } + } else { + return cute::make_tuple(t); + } + + CUTE_GCC_UNREACHABLE; +} + +// Flatten a hierarchical tuple to a tuple of depth one +// and leave non-tuple untouched. 
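+//
+// e.g. (illustrative)
+//   flatten(cute::make_tuple(cute::make_tuple(Int<1>{}, Int<2>{}), Int<3>{}))
+//     => (Int<1>{}, Int<2>{}, Int<3>{})
+//   flatten(Int<1>{})          =>  Int<1>{}   // non-tuple passes through
+//   flatten_to_tuple(Int<1>{}) => (Int<1>{})  // ...whereas this wraps it in a rank-1 tuple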
+template +CUTE_HOST_DEVICE constexpr +auto +flatten(T const& t) +{ + if constexpr (is_tuple::value) { + if constexpr (is_flat::value) { // Shortcut for perf + return t; + } else { + return filter_tuple(t, [](auto const& a) { return flatten_to_tuple(a); }); + } + } else { + return t; + } + + CUTE_GCC_UNREACHABLE; +} + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +unflatten_impl(FlatTuple const& flat_tuple, TargetProfile const& target_profile) +{ + if constexpr (is_tuple::value) { + return fold(target_profile, cute::make_tuple(cute::make_tuple(), flat_tuple), [](auto const& v, auto const& t) { + auto [result, remaining_tuple] = v; + auto [sub_result, sub_tuple] = unflatten_impl(remaining_tuple, t); + return cute::make_tuple(append(result, sub_result), sub_tuple); + }); + } else { + return cute::make_tuple(get<0>(flat_tuple), take<1, decltype(rank(flat_tuple))::value>(flat_tuple)); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +// Unflatten a flat tuple into a hierarchical tuple +// @pre flatten(@a flat_tuple) == @a flat_tuple +// @pre rank(flatten(@a target_profile)) == rank(@a flat_tuple) +// @post congruent(@a result, @a target_profile) +// @post flatten(@a result) == @a flat_tuple +template +CUTE_HOST_DEVICE constexpr +auto +unflatten(FlatTuple const& flat_tuple, TargetProfile const& target_profile) +{ + auto [unflatten_tuple, flat_remainder] = detail::unflatten_impl(flat_tuple, target_profile); + CUTE_STATIC_ASSERT_V(rank(flat_remainder) == Int<0>{}); + return unflatten_tuple; +} + +// +// insert and remove and replace +// + +namespace detail { + +// Shortcut around cute::tuple_cat for common insert/remove/repeat cases +template +CUTE_HOST_DEVICE constexpr +auto +construct(T const& t, X const& x, seq, seq, seq) +{ + return cute::make_tuple(get(t)..., (void(J),x)..., get(t)...); +} + +} // end namespace detail + +// Insert x into the Nth position of the tuple +template +CUTE_HOST_DEVICE constexpr +auto +insert(T const& t, X const& x) +{ + return detail::construct(t, x, make_seq{}, seq<0>{}, make_range::value>{}); +} + +// Remove the Nth element of the tuple +template +CUTE_HOST_DEVICE constexpr +auto +remove(T const& t) +{ + return detail::construct(t, 0, make_seq{}, seq<>{}, make_range::value>{}); +} + +// Replace the Nth element of the tuple with x +template +CUTE_HOST_DEVICE constexpr +auto +replace(T const& t, X const& x) +{ + if constexpr (is_tuple::value) { + return detail::construct(t, x, make_seq{}, seq<0>{}, make_range::value>{}); + } else { + static_assert(N == 0); + return x; + } + + CUTE_GCC_UNREACHABLE; +} + +// Replace the first element of the tuple with x +template +CUTE_HOST_DEVICE constexpr +auto +replace_front(T const& t, X const& x) +{ + if constexpr (is_tuple::value) { + return detail::construct(t, x, seq<>{}, seq<0>{}, make_range<1,tuple_size::value>{}); + } else { + return x; + } + + CUTE_GCC_UNREACHABLE; +} + +// Replace the last element of the tuple with x +template +CUTE_HOST_DEVICE constexpr +auto +replace_back(T const& t, X const& x) +{ + if constexpr (is_tuple::value) { + return detail::construct(t, x, make_seq::value-1>{}, seq<0>{}, seq<>{}); + } else { + return x; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Make a tuple of Xs of tuple_size N +// + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_repeat(X const& x) +{ + return detail::construct(0, x, seq<>{}, make_seq{}, seq<>{}); +} + +// +// Make repeated Xs of rank N +// + +template +CUTE_HOST_DEVICE constexpr +auto +repeat(X const& x) +{ + if constexpr (N == 1) { + 
return x; + } else { + return detail::construct(0, x, seq<>{}, make_seq{}, seq<>{}); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Make a tuple of Xs the same profile as tuple T +// + +template +CUTE_HOST_DEVICE constexpr +auto +repeat_like(T const& t, X const& x) +{ + if constexpr (is_tuple::value) { + return transform(t, [&](auto const& a) { return repeat_like(a,x); }); + } else { + return x; + } + + CUTE_GCC_UNREACHABLE; +} + +// Group the elements [B,E) of a T into a single element +// e.g. group<2,4>(T<_1,_2,_3,_4,_5,_6>{}) +// => T<_1,_2,T<_3,_4>,_5,_6>{} +template +CUTE_HOST_DEVICE constexpr +auto +group(T const& t) +{ + if constexpr (not is_tuple::value) { + if constexpr (E == -1) { + return group(t); + } else { + return detail::construct(t, take(t), make_seq{}, make_seq<(B < E)>{}, make_range{}); + } + } else + if constexpr (E == -1) { + return group::value>(t); + } else + if constexpr (B <= E) { + return detail::construct(t, take(t), make_seq{}, make_seq<(B < E)>{}, make_range::value>{}); + } else { + static_assert(B <= E); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Extend a T to rank N by appending/prepending an element +// + +template +CUTE_HOST_DEVICE constexpr +auto +append(T const& a, X const& x) +{ + if constexpr (is_tuple::value) { + if constexpr (N == tuple_size::value) { + return a; + } else { + static_assert(N > tuple_size::value); + return detail::construct(a, x, make_seq::value>{}, make_seq::value>{}, seq<>{}); + } + } else { + if constexpr (N == 1) { + return a; + } else { + return detail::construct(cute::make_tuple(a), x, seq<0>{}, make_seq{}, seq<>{}); + } + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +append(T const& a, X const& x) +{ + if constexpr (is_tuple::value) { + return detail::construct(a, x, make_seq::value>{}, seq<0>{}, seq<>{}); + } else { + return cute::make_tuple(a, x); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +prepend(T const& a, X const& x) +{ + if constexpr (is_tuple::value) { + if constexpr (N == tuple_size::value) { + return a; + } else { + static_assert(N > tuple_size::value); + return detail::construct(a, x, seq<>{}, make_seq::value>{}, make_seq::value>{}); + } + } else { + if constexpr (N == 1) { + return a; + } else { + static_assert(N > 1); + return detail::construct(cute::make_tuple(a), x, seq<>{}, make_seq{}, seq<0>{}); + } + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +prepend(T const& a, X const& x) +{ + if constexpr (is_tuple::value) { + return detail::construct(a, x, seq<>{}, seq<0>{}, make_seq::value>{}); + } else { + return cute::make_tuple(x, a); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Inclusive scan (prefix sum) +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +iscan(T const& t, V const& v, F&& f, seq) +{ + // Apply the function to v and the element at I + auto v_next = f(v, get(t)); + // Replace I with v_next + auto t_next = replace(t, v_next); + +#if 0 + std::cout << "ISCAN i" << I << std::endl; + std::cout << " t " << t << std::endl; + std::cout << " i " << v << std::endl; + std::cout << " f(i,t) " << v_next << std::endl; + std::cout << " t_n " << t_next << std::endl; +#endif + + if constexpr (sizeof...(Is) == 0) { + return t_next; + } else { + return iscan(t_next, v_next, f, seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +iscan(T const& t, V const& v, F&& f) +{ + return detail::iscan(t, v, f, tuple_seq{}); +} + +// +// Exclusive scan 
(prefix sum) +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +escan(T const& t, V const& v, F&& f, seq) +{ + if constexpr (sizeof...(Is) == 0) { + // Replace I with v + return replace(t, v); + } else { + // Apply the function to v and the element at I + auto v_next = f(v, get(t)); + // Replace I with v + auto t_next = replace(t, v); + +#if 0 + std::cout << "ESCAN i" << I << std::endl; + std::cout << " t " << t << std::endl; + std::cout << " i " << v << std::endl; + std::cout << " f(i,t) " << v_next << std::endl; + std::cout << " t_n " << t_next << std::endl; +#endif + + // Recurse + return escan(t_next, v_next, f, seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +escan(T const& t, V const& v, F&& f) +{ + return detail::escan(t, v, f, tuple_seq{}); +} + +// +// Zip (Transpose) +// + +// Take ((a,b,c,...),(x,y,z,...),...) rank-R0 x rank-R1 input +// to produce ((a,x,...),(b,y,...),(c,z,...),...) rank-R1 x rank-R0 output + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +zip_(Ts const&... ts) +{ + return cute::make_tuple(get(ts)...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +zip(T const& t, seq, seq) +{ + static_assert(conjunction>::value == tuple_size>::value>...>::value, "Mismatched Ranks"); + return cute::make_tuple(zip_(get(t)...)...); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +zip(T const& t) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple>::value) { + return detail::zip(t, tuple_seq{}, tuple_seq>{}); + } else { + return cute::make_tuple(t); + } + } else { + return t; + } + + CUTE_GCC_UNREACHABLE; +} + +// Convenient to pass them in separately +template +CUTE_HOST_DEVICE constexpr +auto +zip(T0 const& t0, T1 const& t1, Ts const&... ts) +{ + return zip(cute::make_tuple(t0, t1, ts...)); +} + +// +// zip2_by -- A guided zip for rank-2 tuples +// Take a tuple like ((A,a),((B,b),(C,c)),d) +// and produce a tuple ((A,(B,C)),(a,(b,c),d)) +// where the rank-2 modes are selected by the terminals of the guide (X,(X,X)) +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +zip2_by(T const& t, TG const& guide, seq, seq) +{ + // zip2_by produces the modes like ((A,a),(B,b),...) + auto split = cute::make_tuple(zip2_by(get(t), get(guide))...); + + // Rearrange and append missing modes from t to make ((A,B,...),(a,b,...,x,y)) + return cute::make_tuple(cute::make_tuple(get<0>(get(split))...), + cute::make_tuple(get<1>(get(split))..., get(t)...)); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +zip2_by(T const& t, TG const& guide) +{ + if constexpr (is_tuple::value) { + constexpr int TR = tuple_size::value; + constexpr int GR = tuple_size::value; + static_assert(TR >= GR, "Mismatched ranks"); + return detail::zip2_by(t, guide, + make_range< 0, GR>{}, + make_range{}); + } else { + static_assert(tuple_size::value == 2, "Mismatched ranks"); + return t; + } + + CUTE_GCC_UNREACHABLE; +} + +/// @return A tuple of the elements of @c t in reverse order. +template +CUTE_HOST_DEVICE constexpr +auto +reverse(T const& t) +{ + if constexpr (is_tuple::value) { + return detail::apply(t, [](auto const&... 
a){ return cute::make_tuple(a...); }, tuple_rseq{}); + } else { + return t; + } +} + +} // end namespace cute diff --git a/include/cute/arch/cluster_sm90.hpp b/include/cute/arch/cluster_sm90.hpp new file mode 100644 index 0000000000..8fff51be8e --- /dev/null +++ b/include/cute/arch/cluster_sm90.hpp @@ -0,0 +1,245 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +// Config +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && \ + ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8)))) +# define CUTE_ARCH_CLUSTER_SM90_ENABLED +#endif + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDACC_VER_MAJOR__ >= 12)) +# define CUTE_ARCH_ELECT_ONE_SM90_ENABLED +#endif + +namespace cute { + +CUTE_DEVICE void cluster_arrive_relaxed() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + asm volatile("barrier.cluster.arrive.relaxed.aligned;\n" : : ); +#else + CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined"); +#endif +} + +CUTE_DEVICE void cluster_arrive() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + asm volatile("barrier.cluster.arrive.aligned;\n" : : ); +#else + CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined"); +#endif +} + +CUTE_DEVICE void cluster_wait() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + asm volatile("barrier.cluster.wait.aligned;\n" : : ); +#else + CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined"); +#endif +} + +CUTE_DEVICE void cluster_sync() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + cluster_arrive(); + cluster_wait(); +#else + CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined"); +#endif +} + +// Returns the dim3 grid size in terms of number of clusters. +CUTE_DEVICE dim3 cluster_grid_dims() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t x, y, z; + asm volatile("mov.u32 %0, %%nclusterid.x;\n" : "=r"(x) : ); + asm volatile("mov.u32 %0, %%nclusterid.y;\n" : "=r"(y) : ); + asm volatile("mov.u32 %0, %%nclusterid.z;\n" : "=r"(z) : ); + return {x, y, z}; +#elif defined(__CUDA_ARCH__) + // MSVC requires protecting use of gridDim with __CUDA_ARCH__. + return gridDim; +#elif defined(_MSC_VER) + CUTE_INVALID_CONTROL_PATH("cluster_grid_dims() can only be called on device"); + return {0, 0, 0}; +#else + return {0, 0, 0}; +#endif +} + +// Returns the dim3 cluster rank in the grid. +CUTE_DEVICE dim3 cluster_id_in_grid() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t x, y, z; + asm volatile("mov.u32 %0, %%clusterid.x;\n" : "=r"(x) : ); + asm volatile("mov.u32 %0, %%clusterid.y;\n" : "=r"(y) : ); + asm volatile("mov.u32 %0, %%clusterid.z;\n" : "=r"(z) : ); + return {x, y, z}; +#elif defined(__CUDA_ARCH__) + // MSVC requires protecting use of blockIdx with __CUDA_ARCH__. + return blockIdx; +#elif defined(_MSC_VER) + CUTE_INVALID_CONTROL_PATH("cluster_id_in_grid() can only be called on device"); + return {0, 0, 0}; +#else + return {0, 0, 0}; +#endif +} + +// Returns the relative dim3 block rank local to the cluster. +CUTE_DEVICE dim3 block_id_in_cluster() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t x, y, z; + asm volatile("mov.u32 %0, %%cluster_ctaid.x;\n" : "=r"(x) : ); + asm volatile("mov.u32 %0, %%cluster_ctaid.y;\n" : "=r"(y) : ); + asm volatile("mov.u32 %0, %%cluster_ctaid.z;\n" : "=r"(z) : ); + return {x, y, z}; +#else + return {0,0,0}; +#endif +} + +// Returns the dim3 cluster shape. 
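+// Falls back to {1,1,1} when the SM90 cluster ISA is unavailable, so a caller can
+// treat a non-cluster launch as a 1x1x1 cluster, e.g. (illustrative):
+//
+//   dim3 cs = cluster_shape();
+//   int cluster_size = cs.x * cs.y * cs.z;   // 1 on non-cluster launches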
+CUTE_DEVICE dim3 cluster_shape() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t x, y, z; + asm volatile("mov.u32 %0, %%cluster_nctaid.x;\n" : "=r"(x) : ); + asm volatile("mov.u32 %0, %%cluster_nctaid.y;\n" : "=r"(y) : ); + asm volatile("mov.u32 %0, %%cluster_nctaid.z;\n" : "=r"(z) : ); + return {x, y, z}; +#else + return {1,1,1}; +#endif +} + +// Get 1D ctaid in a cluster. +CUTE_DEVICE uint32_t block_rank_in_cluster() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t rank; + asm volatile("mov.u32 %0, %%cluster_ctarank;\n" : "=r"(rank) :); + return rank; +#else + return 0; +#endif +} + +// Set the destination block-ID in cluster for a given SMEM Address +CUTE_DEVICE uint32_t set_block_rank(uint32_t smemAddr, uint32_t rank) +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t result; + asm volatile("mapa.shared::cluster.u32 %0, %1, %2;\n" + : "=r"(result) + : "r"(smemAddr), "r"(rank)); + return result; +#else + return smemAddr; +#endif +} + +// Elect one thread in the warp. The elected thread gets its predicate set to true, all others obtain false. +CUTE_HOST_DEVICE uint32_t elect_one_sync() +{ +#if defined(CUTE_ARCH_ELECT_ONE_SM90_ENABLED) + uint32_t pred = 0; + uint32_t laneid = 0; + asm volatile( + "{\n" + ".reg .b32 %%rx;\n" + ".reg .pred %%px;\n" + " elect.sync %%rx|%%px, %2;\n" + "@%%px mov.s32 %1, 1;\n" + " mov.s32 %0, %%rx;\n" + "}\n" + : "+r"(laneid), "+r"(pred) + : "r"(0xFFFFFFFF)); + return pred; +#elif defined(__CUDA_ARCH__) + return (threadIdx.x % 32) == 0; +#else + return true; +#endif +} + +struct ElectOneLaneIdReturnType { + uint32_t is_leader; + uint32_t leader_lane_id; +}; + +CUTE_HOST_DEVICE +ElectOneLaneIdReturnType +elect_one_leader_sync() +{ +#if defined(CUTE_ARCH_ELECT_ONE_SM90_ENABLED) + uint32_t pred = 0; + uint32_t laneid = 0; + asm volatile( + "{\n" + ".reg .b32 %%rx;\n" + ".reg .pred %%px;\n" + " elect.sync %%rx|%%px, %2;\n" + "@%%px mov.s32 %1, 1;\n" + " mov.s32 %0, %%rx;\n" + "}\n" + : "+r"(laneid), "+r"(pred) + : "r"(0xFFFFFFFF)); + return {pred, laneid}; +#elif defined(__CUDA_ARCH__) + return {(threadIdx.x % 32) == 0, 0}; +#else + return {true, 0}; +#endif +} + +// Store value to remote shared memory in the cluster +CUTE_DEVICE +void +store_shared_remote(uint32_t value, uint32_t smem_addr, uint32_t mbarrier_addr, uint32_t dst_cta_rank) +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t dsmem_addr = set_block_rank(smem_addr, dst_cta_rank); + uint32_t remote_barrier_addr = set_block_rank(mbarrier_addr, dst_cta_rank); + asm volatile("st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [%0], %1, [%2];" + : : "r"(dsmem_addr), "r"(value), "r"(remote_barrier_addr)); +#endif +} + +} // end namespace cute diff --git a/include/cute/arch/config.hpp b/include/cute/arch/config.hpp new file mode 100644 index 0000000000..84d7779a34 --- /dev/null +++ b/include/cute/arch/config.hpp @@ -0,0 +1,50 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTLASS_ARCH_MMA_SMxx_ENABLED + +// TMA instructions +#if defined(CUTLASS_ARCH_MMA_SM90_ENABLED) +# define CUTE_ARCH_TMA_SM90_ENABLED +#endif + +#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_ENABLED) +# define CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED +#endif + +// STSM +#if defined(CUTLASS_ARCH_MMA_SM90_ENABLED) +# define CUTE_ARCH_STSM_SM90_ENABLED +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cute/arch/copy.hpp b/include/cute/arch/copy.hpp new file mode 100644 index 0000000000..47dbef2f55 --- /dev/null +++ b/include/cute/arch/copy.hpp @@ -0,0 +1,107 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +// +// Direct Copy for any specific types +// + +template +struct UniversalCopy +{ + using SRegisters = S[1]; + using DRegisters = D[1]; + + // Sanity + static_assert(sizeof_bits_v >= 8); + static_assert(sizeof_bits_v >= 8); + + CUTE_HOST_DEVICE static constexpr void + copy(S const& src, + D & dst) + { + dst = src; + } +}; + +// +// Placeholder for the copy algorithm's stronger auto-vectorizing behavior +// that assumes alignment of pointers and dynamic layouts up to MaxVecBits +// + +template +struct AutoVectorizingCopyWithAssumedAlignment + : UniversalCopy> +{ + static_assert(MaxVecBits == 8 || MaxVecBits == 16 || MaxVecBits == 32 || MaxVecBits == 64 || MaxVecBits == 128, + "Expected MaxVecBits to be 8 or 16 or 32 or 64 or 128 for alignment and performance."); +}; + +// +// AutoVectorizingCopy alias assumes maximal alignment of pointers and dynamic strides. +// If this is not the case then AutoVectorizingCopyWithAssumedAlignment should be used instead +// + +using AutoVectorizingCopy = AutoVectorizingCopyWithAssumedAlignment<128>; + +// +// DefaultCopy alias does not assume alignment of pointers or dynamic strides. +// + +using DefaultCopy = AutoVectorizingCopyWithAssumedAlignment<8>; + +// +// Copy policy automatically selecting between +// UniversalCopy and cp.async , based on type and memory space. +// +struct AutoCopyAsync {}; + +// +// Global memory prefetch into L2 +// + +CUTE_HOST_DEVICE static void +prefetch(void const* gmem_ptr) +{ +#if defined(__CUDA_ARCH__) + asm volatile("prefetch.global.L2 [%0];\n" : : "l"(gmem_ptr) : "memory"); +#endif +} + +} // end namespace cute diff --git a/include/cute/arch/copy_sm50.hpp b/include/cute/arch/copy_sm50.hpp new file mode 100644 index 0000000000..925d9ebe37 --- /dev/null +++ b/include/cute/arch/copy_sm50.hpp @@ -0,0 +1,98 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
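// ------------------------------------------------------------------------------------------------
// [Editorial sketch, not part of the diff] Minimal device-side illustration of the copy policies
// defined in copy.hpp above. Only UniversalCopy, DefaultCopy, AutoVectorizingCopy and prefetch
// come from that file; the function and parameter names here are hypothetical.
__device__ void copy_policy_example(float const& src, float& dst, void const* gmem_ptr)
{
  // UniversalCopy performs a plain assignment from source to destination.
  cute::UniversalCopy<float>::copy(src, dst);

  // DefaultCopy (an alias for AutoVectorizingCopyWithAssumedAlignment<8>) assumes only 8-bit
  // alignment, while AutoVectorizingCopy assumes 128-bit alignment and therefore lets the
  // copy algorithm vectorize more aggressively when pointers and dynamic strides permit.

  // Prefetch a global-memory line into L2 ahead of its first use.
  cute::prefetch(gmem_ptr);
}
// ------------------------------------------------------------------------------------------------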
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500 + #define CUTE_ARCH_WARP_SHUFFLE_ENABLED 1 +#endif + +namespace cute +{ +// Shuffle data between thread pair (0, 1), (2, 3), etc. +struct SM50_Shuffle_U32_2x2Trans_XOR1 +{ + using SRegisters = uint32_t[2]; + using DRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + copy(uint32_t const& src0, uint32_t const& src1, uint32_t& dst0, uint32_t& dst1) + { +#if defined(CUTE_ARCH_WARP_SHUFFLE_ENABLED) + uint32_t x0 = src0; + uint32_t y0 = __shfl_xor_sync(0xffffffff, x0, 1); + + uint32_t x1 = src1; + uint32_t y1 = __shfl_xor_sync(0xffffffff, x1, 1); + + if (threadIdx.x % 2 == 0) { + dst1 = y0; + } + else { + dst0 = y1; + } +#else + CUTE_INVALID_CONTROL_PATH("Trying to use __shfl_xor_sync without CUTE_ARCH_WARP_SHUFFLE_ENABLED."); +#endif + } +}; + +// Shuffle data between thread pair (0, 4), (1, 5), etc. +struct SM50_Shuffle_U32_2x2Trans_XOR4 +{ + using SRegisters = uint32_t[2]; + using DRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + copy(uint32_t const& src0, uint32_t const& src1, uint32_t& dst0, uint32_t& dst1) + { +#if defined(CUTE_ARCH_WARP_SHUFFLE_ENABLED) + uint32_t x0 = threadIdx.x & 4 ? src0 : src1; + uint32_t y0 = __shfl_xor_sync(0xffffffff, x0, 4); + + // Replace detination register with shuffle result. + if (threadIdx.x & 0x4) { + dst0 = y0; + } + else { + dst1 = y0; + } +#else + CUTE_INVALID_CONTROL_PATH("Trying to use __shfl_xor_sync without CUTE_ARCH_WARP_SHUFFLE_ENABLED."); +#endif + } +}; + + +} // end namespace cute diff --git a/include/cute/arch/copy_sm75.hpp b/include/cute/arch/copy_sm75.hpp new file mode 100644 index 0000000000..3d3d37acb0 --- /dev/null +++ b/include/cute/arch/copy_sm75.hpp @@ -0,0 +1,236 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +// Config +#if defined(__clang__) && defined(__CUDA__) + // ldmatrix PTX instructions added in Clang 14: https://reviews.llvm.org/D107046 + // ... but will not work until Clang 15: + // * https://reviews.llvm.org/D121666 + // * https://reviews.llvm.org/D126846 + #define CUTE_ARCH_CLANG_SUPPORTS_LDSM_SM75 (__clang_major__ >= 15) +#endif + +#if defined(__NVCC__) || defined(__CUDACC_RTC__) + // ldmatrix PTX instruction added in CUDA 10.2+ + #define CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 ((__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) || __CUDACC_VER_MAJOR__ >= 11) +#endif + +#if ! defined(CUTE_ARCH_LDSM_SM75_SUPPORTED) + #define CUTE_ARCH_LDSM_SM75_SUPPORTED (CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 || CUTE_ARCH_CLANG_SUPPORTS_LDSM_SM75) +#endif + +#if ! 
defined(CUTE_ARCH_LDSM_SM75_ENABLED) + #define CUTE_ARCH_LDSM_SM75_ENABLED (CUTE_ARCH_LDSM_SM75_SUPPORTED) +#endif + +#if (CUTE_ARCH_LDSM_SM75_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750 + #define CUTE_ARCH_LDSM_SM75_ACTIVATED 1 +#endif + +namespace cute +{ + +struct SM75_U32x1_LDSM_N +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[1]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst) + { +#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];\n" + : "=r"(dst) + : "r"(smem_int_ptr)); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED."); +#endif + } +}; + +struct SM75_U32x2_LDSM_N +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst0, uint32_t& dst1) + { +#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n" + : "=r"(dst0), "=r"(dst1) + : "r"(smem_int_ptr)); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED."); +#endif + } +}; + +struct SM75_U32x4_LDSM_N +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst0, uint32_t& dst1, uint32_t& dst2, uint32_t& dst3) + { +#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(dst0), "=r"(dst1), "=r"(dst2), "=r"(dst3) + : "r"(smem_int_ptr)); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED."); +#endif + } +}; + +struct SM75_U16x2_LDSM_T +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[1]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst) + { +#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];\n" + : "=r"(dst) + : "r"(smem_int_ptr)); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED."); +#endif + } +}; + +struct SM75_U16x4_LDSM_T +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst0, uint32_t& dst1) + { +#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];\n" + : "=r"(dst0), "=r"(dst1) + : "r"(smem_int_ptr)); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED."); +#endif + } +}; + +struct SM75_U16x8_LDSM_T +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst0, uint32_t& dst1, uint32_t& dst2, uint32_t& dst3) + { +#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(dst0), "=r"(dst1), "=r"(dst2), "=r"(dst3) + : 
"r"(smem_int_ptr)); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED."); +#endif + } +}; + +// +// Legacy LDSM interfaces that aren't very useful +// + +template +CUTE_HOST_DEVICE +void +copy_ldsm(uint128_t const* const smem_ptr, + T* rmem_ptr) +{ + uint32_t* reg_ptr = reinterpret_cast(rmem_ptr); + + // if constexpr + if (sizeof(T) == 4) { + SM75_U32x1_LDSM_N::copy(smem_ptr[0], reg_ptr[0]); + } + else if (sizeof(T) == 8) { + SM75_U32x2_LDSM_N::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1]); + } + else if (sizeof(T) == 16) { + SM75_U32x4_LDSM_N::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3]); + } + else { + static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported"); + } +} + +template +CUTE_HOST_DEVICE +void +copy_ldsm_trans(uint128_t const* const smem_ptr, + T* rmem_ptr) +{ + uint32_t* reg_ptr = reinterpret_cast(rmem_ptr); + + // if constexpr + if (sizeof(T) == 4) { + SM75_U16x2_LDSM_T::copy(smem_ptr[0], reg_ptr[0]); + } + else if (sizeof(T) == 8) { + SM75_U16x4_LDSM_T::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1]); + } + else if (sizeof(T) == 16) { + SM75_U16x8_LDSM_T::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3]); + } + else { + static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported"); + } +} + +} // end namespace cute diff --git a/include/cute/arch/copy_sm80.hpp b/include/cute/arch/copy_sm80.hpp new file mode 100644 index 0000000000..e04181bfe9 --- /dev/null +++ b/include/cute/arch/copy_sm80.hpp @@ -0,0 +1,198 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include + +// Config +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) +# define CUTE_ARCH_CP_ASYNC_SM80_ENABLED +#endif + +namespace cute +{ + +/// Copy via cp.async with caching at all levels +template +struct SM80_CP_ASYNC_CACHEALWAYS +{ + using SRegisters = TS[1]; + using DRegisters = TD[1]; + + static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)"); + static_assert(sizeof(TS) == 4 || sizeof(TS) == 8 || sizeof(TS) == 16, "cp.async sizeof(TS) is not supported"); + + CUTE_HOST_DEVICE static void + copy(TS const& gmem_src, + TD & smem_dst) + { +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + TS const* gmem_ptr = &gmem_src; + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + asm volatile("cp.async.ca.shared.global.L2::128B [%0], [%1], %2;\n" + :: "r"(smem_int_ptr), + "l"(gmem_ptr), + "n"(sizeof(TS))); +#else + CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled"); +#endif + } +}; + +/// Copy via cp.async with caching at global level +template +struct SM80_CP_ASYNC_CACHEGLOBAL +{ + using SRegisters = TS[1]; + using DRegisters = TD[1]; + + static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)"); + static_assert(sizeof(TS) == 16, "cp.async sizeof(TS) is not supported"); + + CUTE_HOST_DEVICE static void + copy(TS const& gmem_src, + TD & smem_dst) + { +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + TS const* gmem_ptr = &gmem_src; + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2;\n" + :: "r"(smem_int_ptr), + "l"(gmem_ptr), + "n"(sizeof(TS))); +#else + CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled"); +#endif + } +}; + +/// Copy via cp.async with caching at all levels +template +struct SM80_CP_ASYNC_CACHEALWAYS_ZFILL +{ + using SRegisters = TS[1]; + using DRegisters = TD[1]; + + static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)"); + static_assert(sizeof(TS) == 4 || sizeof(TS) == 8 || sizeof(TS) == 16, "cp.async sizeof(TS) is not supported"); + + CUTE_HOST_DEVICE static void + copy(TS const& gmem_src, + TD & smem_dst, + bool pred) + { +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + TS const* gmem_ptr = &gmem_src; + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + int src_size = pred ? sizeof(TS) : 0; + asm volatile("cp.async.ca.shared.global.L2::128B [%0], [%1], %2, %3;\n" + :: "r"(smem_int_ptr), + "l"(gmem_ptr), + "n"(sizeof(TS)), + "r"(src_size)); +#else + CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled"); +#endif + } +}; + +/// Copy via cp.async with caching at global level +template +struct SM80_CP_ASYNC_CACHEGLOBAL_ZFILL +{ + using SRegisters = TS[1]; + using DRegisters = TD[1]; + + static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)"); + static_assert(sizeof(TS) == 16, "cp.async sizeof(TS) is not supported"); + + CUTE_HOST_DEVICE static void + copy(TS const& gmem_src, + TD & smem_dst, + bool pred) + { +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + TS const* gmem_ptr = &gmem_src; + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + int src_size = pred ? 
sizeof(TS) : 0; + asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2, %3;\n" + :: "r"(smem_int_ptr), + "l"(gmem_ptr), + "n"(sizeof(TS)), + "r"(src_size)); +#else + CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block. +CUTE_HOST_DEVICE +void +cp_async_fence() +{ +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + asm volatile("cp.async.commit_group;\n" ::); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Blocks until all but N previous cp.async.commit_group operations have committed. +template +CUTE_HOST_DEVICE +void +cp_async_wait() +{ +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + if constexpr (N == 0) { + asm volatile("cp.async.wait_all;\n" ::); + } else { + asm volatile("cp.async.wait_group %0;\n" :: "n"(N)); + } +#endif +} + +template +CUTE_HOST_DEVICE +void +cp_async_wait(Int) +{ + return cp_async_wait(); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute diff --git a/include/cute/arch/copy_sm90.hpp b/include/cute/arch/copy_sm90.hpp new file mode 100644 index 0000000000..bcb3b7d19c --- /dev/null +++ b/include/cute/arch/copy_sm90.hpp @@ -0,0 +1,219 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
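// ------------------------------------------------------------------------------------------------
// [Editorial sketch, not part of the diff] Typical staging pattern built from the cp.async
// primitives above: issue asynchronous 16-byte global->shared copies, close the commit group,
// then wait for it before the CTA reads the data. The buffer sizes and indexing are illustrative.
__device__ void cp_async_example(cute::uint128_t const* gmem, cute::uint128_t* smem, int n)
{
  int tid = threadIdx.x;                      // `smem` must point into __shared__ storage
  if (tid < n) {
    // Asynchronous 16-byte copy cached at the global (L2) level only.
    cute::SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>::copy(gmem[tid], smem[tid]);
  }
  // Close the current commit group ...
  cute::cp_async_fence();
  // ... block until zero groups remain outstanding, then make the writes visible to the CTA.
  cute::cp_async_wait<0>();
  __syncthreads();
}
// ------------------------------------------------------------------------------------------------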
+ * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // CUTE_ARCH_TMA_SMxx_ENABLED +#include + +namespace cute +{ + +struct SM90_U32x1_STSM_N +{ + using SRegisters = uint32_t[1]; + using DRegisters = uint128_t[1]; + + CUTE_HOST_DEVICE static void + copy(uint32_t const& src, + uint128_t & smem_dst) + { +#if defined(CUTE_ARCH_STSM_SM90_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + asm volatile ("stmatrix.sync.aligned.x1.m8n8.shared.b16 [%0], {%1};\n" + :: "r"(smem_int_ptr), + "r"(src)); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED."); +#endif + } +}; + +struct SM90_U32x2_STSM_N +{ + using SRegisters = uint32_t[2]; + using DRegisters = uint128_t[1]; + + CUTE_HOST_DEVICE static void + copy(uint32_t const& src0, uint32_t const& src1, + uint128_t& smem_dst) + { +#if defined(CUTE_ARCH_STSM_SM90_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + asm volatile ("stmatrix.sync.aligned.x2.m8n8.shared.b16 [%0], {%1, %2};\n" + :: "r"(smem_int_ptr), + "r"(src0), "r"(src1)); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED."); +#endif + } +}; + +struct SM90_U32x4_STSM_N +{ + using SRegisters = uint32_t[4]; + using DRegisters = uint128_t[1]; + + CUTE_HOST_DEVICE static void + copy(uint32_t const& src0, uint32_t const& src1, uint32_t const& src2, uint32_t const& src3, + uint128_t& smem_dst) + { +#if defined(CUTE_ARCH_STSM_SM90_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + asm volatile ("stmatrix.sync.aligned.x4.m8n8.shared.b16 [%0], {%1, %2, %3, %4};\n" + :: "r"(smem_int_ptr), + "r"(src0), "r"(src1), "r"(src2), "r"(src3)); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED."); +#endif + } +}; + +struct SM90_U16x2_STSM_T +{ + using SRegisters = uint32_t[1]; + using DRegisters = uint128_t[1]; + + CUTE_HOST_DEVICE static void + copy(uint32_t const& src, + uint128_t& smem_dst) + { +#if defined(CUTE_ARCH_STSM_SM90_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + asm volatile ("stmatrix.sync.aligned.x1.trans.m8n8.shared.b16 [%0], {%1};\n" + :: "r"(smem_int_ptr), + "r"(src)); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED."); +#endif + } +}; + +struct SM90_U16x4_STSM_T +{ + using SRegisters = uint32_t[2]; + using DRegisters = uint128_t[1]; + + CUTE_HOST_DEVICE static void + copy(uint32_t const& src0, uint32_t const& src1, + uint128_t& smem_dst) + { +#if defined(CUTE_ARCH_STSM_SM90_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + asm volatile ("stmatrix.sync.aligned.x2.trans.m8n8.shared.b16 [%0], {%1, %2};\n" + :: "r"(smem_int_ptr), + "r"(src0), "r"(src1)); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED."); +#endif + } +}; + +struct SM90_U16x8_STSM_T +{ + using SRegisters = uint32_t[4]; + using DRegisters = uint128_t[1]; + + CUTE_HOST_DEVICE static void + copy(uint32_t const& src0, uint32_t const& src1, uint32_t const& src2, uint32_t const& src3, + uint128_t& smem_dst) + { +#if defined(CUTE_ARCH_STSM_SM90_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + asm volatile ("stmatrix.sync.aligned.x4.trans.m8n8.shared.b16 [%0], {%1, %2, %3, %4};\n" + :: "r"(smem_int_ptr), + "r"(src0), "r"(src1), "r"(src2), "r"(src3)); +#else + 
CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED."); +#endif + } +}; + +// +// Legacy STSM interfaces that aren't very useful +// + +template +CUTE_HOST_DEVICE +void +copy_stsm(T const* const rmem_ptr, + uint128_t* const smem_ptr) +{ + uint32_t const* reg_ptr = reinterpret_cast(rmem_ptr); + + // if constexpr + if (sizeof(T) == 4) { + SM90_U32x1_STSM_N::copy(reg_ptr[0], smem_ptr[0]); + } + else if (sizeof(T) == 8) { + SM90_U32x2_STSM_N::copy(reg_ptr[0], reg_ptr[1], smem_ptr[0]); + } + else if (sizeof(T) == 16) { + SM90_U32x4_STSM_N::copy(reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3], smem_ptr[0]); + } + else { + static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported"); + } +} + +template +CUTE_HOST_DEVICE +void +copy_stsm_trans(T const* const rmem_ptr, + uint128_t* const smem_ptr) +{ + uint32_t const* reg_ptr = reinterpret_cast(rmem_ptr); + + // if constexpr + if (sizeof(T) == 4) { + SM90_U16x2_STSM_T::copy(reg_ptr[0], smem_ptr[0]); + } + else if (sizeof(T) == 8) { + SM90_U16x4_STSM_T::copy(reg_ptr[0], reg_ptr[1], smem_ptr[0]); + } + else if (sizeof(T) == 16) { + SM90_U16x8_STSM_T::copy(reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3], smem_ptr[0]); + } + else { + static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported"); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cute/arch/copy_sm90_desc.hpp b/include/cute/arch/copy_sm90_desc.hpp new file mode 100644 index 0000000000..cc0bf4a392 --- /dev/null +++ b/include/cute/arch/copy_sm90_desc.hpp @@ -0,0 +1,440 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/numeric_types.h"
+
+#if !defined(__CUDACC_RTC__)
+#include
+#include
+#endif
+
+#include
+
+#include // cute::cast_smem_ptr_to_uint
+#include // CUTE_ARCH_TMA_SMxx_ENABLED
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+namespace cute
+{
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Barriers are 64 bits of user-managed information used in broadly two types of synchronization patterns
+/// 1) arrive/wait on threads (usage: cp.async and warp-specialized kernels)
+/// 2) transaction-based (usage: TMA transaction where a CTA issues one transaction)
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Initialize barrier present in shared memory
+CUTE_HOST_DEVICE
+void
+initialize_barrier(uint64_t& smem_barrier,                 // 64 bits user-managed barrier in smem
+                   int thread_count = 1)                   // Thread count expected to arrive/wait on this barrier
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile ("mbarrier.init.shared::cta.b64 [%0], %1;\n"
+    :: "r"(smem_int_ptr),
+       "r"(thread_count));
+#endif
+}
+
+// Set the number of bytes transferred per transaction and perform an arrive operation as well
+CUTE_HOST_DEVICE
+void
+set_barrier_transaction_bytes(uint64_t& smem_barrier,      // 64 bits user-managed barrier in smem
+                              uint32_t bytes)              // Number of bytes transferred per TMA transaction
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile ("mbarrier.arrive.expect_tx.shared::cta.b64 _, [%0], %1;\n"
+    :: "r"(smem_int_ptr),
+       "r"(bytes));
+#endif
+}
+
+// Barrier wait
+CUTE_HOST_DEVICE
+void
+wait_barrier(uint64_t& smem_barrier,                       // 64 bits user-managed barrier in smem
+             int phase_bit)                                // Current phase bit the barrier is waiting to flip
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile(
+    "{\n"
+    ".reg .pred P1;\n"
+    "LAB_WAIT:\n"
+    "mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1;\n"
+    "@P1 bra DONE;\n"
+    "bra LAB_WAIT;\n"
+    "DONE:\n"
+    "}\n"
+    :: "r"(smem_int_ptr),
+       "r"(phase_bit));
+
+#endif
+}
+
+// Barrier arrive
+CUTE_HOST_DEVICE
+void
+arrive_barrier(uint64_t& smem_barrier)                     // 64 bits user-managed barrier in smem
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile(
+    "{\n"
+    ".reg .b64 state; \n"
+    "mbarrier.arrive.shared::cta.b64 state, [%0];\n"
+    "}\n"
+    :: "r"(smem_int_ptr));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// TMA Descriptor and utilities
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace
TMA { + +enum class SmemSwizzleBits : uint8_t { + DISABLE = 0, + B32 = 1, + B64 = 2, + B128 = 3, +}; + +enum class SmemSwizzleBase : uint8_t { + SWIZZLE_BASE_16B = 0, +}; + +enum class OOBFill : uint8_t { + ZERO = 0, + CONSTANT = 1, +}; + +CUTE_HOST_DEVICE char const* to_string(OOBFill const& t) { + switch (t) { + case OOBFill::ZERO: return "ZERO"; + case OOBFill::CONSTANT: return "CONSTANT"; + } + return nullptr; +} + +enum class L2Promotion : uint8_t { + DISABLE = 0, + B64 = 1, + B128 = 2, + B256 = 3, +}; + +CUTE_HOST_DEVICE char const* to_string(L2Promotion const& t) { + switch (t) { + case L2Promotion::DISABLE: return "DISABLE"; + case L2Promotion::B64: return "B64"; + case L2Promotion::B128: return "B128"; + case L2Promotion::B256: return "B256"; + } + return nullptr; +} + +// Aux parameters which are independent with the problem size +struct DescriptorAuxParams { + OOBFill oobfill_ = OOBFill::ZERO; + L2Promotion l2promo_ = L2Promotion::DISABLE; +}; + +enum class CacheHintSm90 : uint64_t { + EVICT_NORMAL = 0x1000000000000000, + EVICT_FIRST = 0x12F0000000000000, + EVICT_LAST = 0x14F0000000000000, +}; + +#if (__CUDACC_VER_MAJOR__ >= 12) + +#if !defined(__CUDACC_RTC__) +/// @return The TMA descriptor datatype enum corresponding to T. +template +inline CUtensorMapDataType +to_CUtensorMapDataType() { + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT8; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT8; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT8; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT8; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT16; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT32; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_UINT64; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_INT32; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_INT64; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT16; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT32; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT64; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16; } else + if constexpr (is_same_v) { return CU_TENSOR_MAP_DATA_TYPE_TFLOAT32; } else + { static_assert(sizeof(T) < 0, "Unknown TMA Format!"); } +} + +inline CUtensorMapSwizzle +to_CUtensorMapSwizzle(SmemSwizzleBits const& t, SmemSwizzleBase const& b) { + switch (t) { + default: assert(false && "Unsupported pair of SmemSwizzleBits and SmemSwizzleBase!"); + case SmemSwizzleBits::DISABLE: + assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 0B swizzle bits."); + return CU_TENSOR_MAP_SWIZZLE_NONE; + case SmemSwizzleBits::B32: + assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 32B swizzle bits."); + return CU_TENSOR_MAP_SWIZZLE_32B; + case SmemSwizzleBits::B64: + assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 64B swizzle bits."); + return CU_TENSOR_MAP_SWIZZLE_64B; + case SmemSwizzleBits::B128: + assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 128B swizzle bits."); + return CU_TENSOR_MAP_SWIZZLE_128B; + } +} + +inline CUtensorMapFloatOOBfill +to_CUtensorMapFloatOOBfill(OOBFill const& t) { + switch(t) { + default: assert(false && "Unknown OOBFill!"); + case OOBFill::ZERO: return CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE; + case 
OOBFill::CONSTANT: return CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA; + } +} + +inline CUtensorMapL2promotion +to_CUtensorMapL2promotion(L2Promotion const& t) { + switch(t) { + default: assert(false && "Unknown L2Promotion!"); + case L2Promotion::DISABLE: return CU_TENSOR_MAP_L2_PROMOTION_NONE; + case L2Promotion::B64: return CU_TENSOR_MAP_L2_PROMOTION_L2_64B; + case L2Promotion::B128: return CU_TENSOR_MAP_L2_PROMOTION_L2_128B; + case L2Promotion::B256: return CU_TENSOR_MAP_L2_PROMOTION_L2_256B; + } +} + +#endif // !defined(__CUDACC_RTC__) + +#endif // (__CUDACC_VER_MAJOR__ >= 12) + +} // end namespace TMA + +#if (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__) + using TmaDescriptor = CUtensorMap; + using Im2ColTmaDescriptor = CUtensorMap; +#else + using TmaDescriptor = struct alignas(64) { char bytes[128]; }; + using Im2ColTmaDescriptor = struct alignas(64) { char bytes[128]; }; +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Initiates a TensorMap Prefetch +//////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTE_HOST_DEVICE +void +prefetch_tma_descriptor(TmaDescriptor const* desc_ptr) +{ +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + // Prefetch TMA Descriptor using generic addressing (i.e. no specific state space: const or param) + asm volatile ( + "prefetch.tensormap [%0];" + : + : "l"(gmem_int_desc) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use TMA Descriptor Prefetch without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Perform a TensorMap modification (by each field) +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Replace tensor pointer directly in GMEM +CUTE_HOST_DEVICE +void +tma_descriptor_replace_addr_in_global_mem(TmaDescriptor const* desc_ptr, + void const* const new_tensor_ptr) +{ +#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint64_t const new_desc_addr = reinterpret_cast(new_tensor_ptr); + asm volatile ( + "tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" + :: "l"(gmem_int_desc), "l"(new_desc_addr)); +#else + CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3"); +#endif +} + +// Replace tensor pointer by bringing the tensormap from GMEM into the shared memory +CUTE_HOST_DEVICE +void +tma_descriptor_replace_addr_in_shared_mem(TmaDescriptor& smem_desc, + void const* const new_tensor_ptr) +{ +#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED) + uint32_t smem_int_desc = cast_smem_ptr_to_uint(&smem_desc); + uint64_t const new_desc_addr = reinterpret_cast(new_tensor_ptr); + asm volatile ( + "tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" + :: "r"(smem_int_desc), "l"(new_desc_addr)); +#else + CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3"); +#endif +} + +// Replace tensor dims and strides for GEMMs by bringing the tensormap from GMEM into the shared memory +CUTE_HOST_DEVICE +void +tma_descriptor_replace_dims_strides_in_shared_mem(TmaDescriptor & smem_desc, + cute::array const& prob_shape, + cute::array const& prob_stride) +{ +#if 
defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED) + uint32_t smem_int_desc = cast_smem_ptr_to_uint(&smem_desc); + uint64_t const smem_int64_desc = 0; + asm volatile ( + "cvt.u64.u32 %0, %1;" + :: "l"(smem_int64_desc), "r"(smem_int_desc)); + asm volatile ( + "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 0, %1;" + :: "l"(smem_int64_desc), "r"(prob_shape[0])); + asm volatile ( + "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 1, %1;" + :: "l"(smem_int64_desc), "r"(prob_shape[1])); + asm volatile ( + "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 2, %1;" + :: "l"(smem_int64_desc), "r"(prob_shape[2])); + asm volatile ( + "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 3, %1;" + :: "l"(smem_int64_desc), "r"(prob_shape[3])); + asm volatile ( + "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 4, %1;" + :: "l"(smem_int64_desc), "r"(prob_shape[4])); + // Strides must be a multiple of 16. Also, stride for the intermost dimension is implicitly 1 + #if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 5))) + asm volatile ( + "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 0, %1;" + :: "l"(smem_int64_desc), "l"(prob_stride[1])); + asm volatile ( + "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 1, %1;" + :: "l"(smem_int64_desc), "l"(prob_stride[2])); + asm volatile ( + "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 2, %1;" + :: "l"(smem_int64_desc), "l"(prob_stride[3])); + asm volatile ( + "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 3, %1;" + :: "l"(smem_int64_desc), "l"(prob_stride[4])); + #else + // 4 LSBs are not included + asm volatile ( + "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 0, %1;" + :: "l"(smem_int64_desc), "l"(prob_stride[1] >> 4)); + asm volatile ( + "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 1, %1;" + :: "l"(smem_int64_desc), "l"(prob_stride[2] >> 4)); + asm volatile ( + "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 2, %1;" + :: "l"(smem_int64_desc), "l"(prob_stride[3] >> 4)); + asm volatile ( + "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 3, %1;" + :: "l"(smem_int64_desc), "l"(prob_stride[4] >> 4)); + #endif +#else + CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3"); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Perform a fused copy and fence operation (needed when modifying tensormap in shared memory) +//////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTE_HOST_DEVICE +void +tma_descriptor_cp_fence_release(TmaDescriptor const* gmem_desc_ptr, TmaDescriptor& smem_desc) +{ +#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(gmem_desc_ptr); + uint32_t smem_int_desc = cast_smem_ptr_to_uint(&smem_desc); + asm volatile ( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], 128;" + :: "l"(gmem_int_desc), "r"(smem_int_desc)); +#else + CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3"); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Perform a release fence 
operation (needed when modifying tensormap directly in GMEM) +//////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTE_HOST_DEVICE +void +tma_descriptor_fence_release() +{ +#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED) + asm volatile ("fence.proxy.tensormap::generic.release.gpu;"); +#else + CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3"); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Perform a acquire fence operation +//////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTE_HOST_DEVICE +void +tma_descriptor_fence_acquire(TmaDescriptor const* desc_ptr) +{ +#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + asm volatile ( + "fence.proxy.tensormap::generic.acquire.gpu [%0], 128;" + : + : "l"(gmem_int_desc) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3"); +#endif +} + +/////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute diff --git a/include/cute/arch/copy_sm90_tma.hpp b/include/cute/arch/copy_sm90_tma.hpp new file mode 100644 index 0000000000..fb33d63cad --- /dev/null +++ b/include/cute/arch/copy_sm90_tma.hpp @@ -0,0 +1,1395 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
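// ------------------------------------------------------------------------------------------------
// [Editorial sketch, not part of the diff] Consumer-side use of the mbarrier helpers defined in
// copy_sm90_desc.hpp above, as they are typically paired with a TMA load. The single-stage setup
// and the externally supplied `phase` are illustrative; real kernels also fence the barrier
// initialization and track one phase bit per pipeline stage. `smem_bar` must live in __shared__.
__device__ void mbarrier_example(uint64_t& smem_bar, uint32_t bytes_expected, int phase)
{
  if (threadIdx.x == 0) {
    // One thread owns the barrier: initialize it for a single arrival, then post how many
    // bytes the upcoming TMA transaction will deposit.
    cute::initialize_barrier(smem_bar, /*thread_count=*/1);
    cute::set_barrier_transaction_bytes(smem_bar, bytes_expected);
    // ... issue the TMA load here, passing &smem_bar as its mbarrier ...
  }
  __syncthreads();
  // All consumers spin until the barrier's phase bit flips, i.e. the transaction has completed.
  cute::wait_barrier(smem_bar, phase);
}
// ------------------------------------------------------------------------------------------------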
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include // CUTE_ARCH_TMA_SMxx_ENABLED +#include +#include +#include "cutlass/arch/synclog.hpp" + +namespace cute +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// TMA_LOAD : Initiates a TMA copy from global memory to shared memory +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_LOAD_1D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint" + " [%0], [%1, {%3}], [%2], %4;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "r"(crd0), "l"(cache_hint) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } + + struct PREFETCH + { + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& crd0) + { + #if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + asm volatile ( + "cp.async.bulk.prefetch.tensor.1d.L2.global" + " [%0, {%1}];" + : + : "l"(gmem_int_desc), + "r"(crd0) + : "memory"); + #else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); + #endif + } + }; +}; + +struct SM90_TMA_LOAD_2D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint" + " [%0], [%1, {%3, %4}], [%2], %5;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "r"(crd0), "r"(crd1), "l"(cache_hint) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } + + struct PREFETCH + { + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& crd0, int32_t const& crd1) + { + #if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + asm volatile ( + "cp.async.bulk.prefetch.tensor.2d.L2.global" + " [%0, {%1, %2}];" + : + : "l"(gmem_int_desc), + "r"(crd0), "r"(crd1) + : "memory"); + #else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); + #endif + } + }; +}; + +struct SM90_TMA_LOAD_3D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t 
smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint" + " [%0], [%1, {%3, %4, %5}], [%2], %6;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "r"(crd0), "r"(crd1), "r"(crd2), "l"(cache_hint) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } + + struct PREFETCH + { + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2) + { + #if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + asm volatile ( + "cp.async.bulk.prefetch.tensor.3d.L2.global" + " [%0, {%1, %2, %3}];" + : + : "l"(gmem_int_desc), + "r"(crd0), "r"(crd1), "r"(crd2) + : "memory"); + #else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); + #endif + } + }; +}; + +struct SM90_TMA_LOAD_4D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint" + " [%0], [%1, {%3, %4, %5, %6}], [%2], %7;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "l"(cache_hint) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } + + struct PREFETCH + { + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3) + { + #if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + asm volatile ( + "cp.async.bulk.prefetch.tensor.4d.L2.global" + " [%0, {%1, %2, %3, %4}];" + : + : "l"(gmem_int_desc), + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3) + : "memory"); + #else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); + #endif + } + }; +}; + +struct SM90_TMA_LOAD_5D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint" + " [%0], [%1, {%3, %4, %5, %6, %7}], [%2], %8;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4), "l"(cache_hint) + : "memory"); +#else + 
CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } + + struct PREFETCH + { + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4) + { + #if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + asm volatile ( + "cp.async.bulk.prefetch.tensor.5d.L2.global" + " [%0, {%1, %2, %3, %4, %5}];" + : + : "l"(gmem_int_desc), + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4) + : "memory"); + #else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); + #endif + } + }; +}; + +struct SM90_TMA_LOAD +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0) + { + return SM90_TMA_LOAD_1D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1) + { + return SM90_TMA_LOAD_2D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2) + { + return SM90_TMA_LOAD_3D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1, crd2); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3) + { + return SM90_TMA_LOAD_4D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1, crd2, crd3); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4) + { + return SM90_TMA_LOAD_5D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1, crd2, crd3, crd4); + } + + struct PREFETCH + { + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& crd0) + { + return SM90_TMA_LOAD_1D::PREFETCH::copy(desc_ptr, crd0); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& crd0, int32_t const& crd1) + { + return SM90_TMA_LOAD_2D::PREFETCH::copy(desc_ptr, crd0, crd1); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2) + { + return SM90_TMA_LOAD_3D::PREFETCH::copy(desc_ptr, crd0, crd1, crd2); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3) + { + return SM90_TMA_LOAD_4D::PREFETCH::copy(desc_ptr, crd0, crd1, crd2, crd3); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4) + { + return SM90_TMA_LOAD_5D::PREFETCH::copy(desc_ptr, crd0, crd1, crd2, crd3, crd4); + } + }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// TMA_LOAD im2col: Initiates a TMA copy, in im2col mode, from global memory to shared memory +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_LOAD_IM2COL_3D +{ + 
CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n, + uint16_t const& offset_w) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + // Copy from global to shared::cluster. + asm volatile ( + "cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes" + " [%0], [%1, {%3, %4, %5}], [%2], {%6};" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "r"(coord_c), "r"(coord_w), "r"(coord_n), + "h"(offset_w) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } + + struct PREFETCH + { + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n, + uint16_t const& offset_w) + { + #if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + asm volatile ( + "cp.async.bulk.prefetch.tensor.3d.L2.global.im2col" + " [%0, {%1, %2, %3}], {%4};" + : + : "l"(gmem_int_desc), + "r"(coord_c), "r"(coord_w), "r"(coord_n), + "h"(offset_w) + : "memory"); + #else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); + #endif + } + }; +}; + +struct SM90_TMA_LOAD_IM2COL_4D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + // Copy from global to shared::cluster. 
+ asm volatile ( + "cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes" + " [%0], [%1, {%3, %4, %5, %6}], [%2], {%7, %8};" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n), + "h"(offset_w), "h"(offset_h) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } + + struct PREFETCH + { + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h) + { + #if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + asm volatile ( + "cp.async.bulk.prefetch.tensor.4d.L2.global.im2col" + " [%0, {%1, %2, %3, %4}], {%5, %6};" + : + : "l"(gmem_int_desc), + "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n), + "h"(offset_w), "h"(offset_h) + : "memory"); + #else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); + #endif + } + }; +}; + +struct SM90_TMA_LOAD_IM2COL_5D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + // Copy from global to shared::cluster. 
+ asm volatile ( + "cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes" + " [%0], [%1, {%3, %4, %5, %6, %7}], [%2], {%8, %9, %10};" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n), + "h"(offset_w), "h"(offset_h), "h"(offset_d) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } + + struct PREFETCH + { + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d) + { + #if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + asm volatile ( + "cp.async.bulk.prefetch.tensor.5d.L2.global.im2col" + " [%0, {%1, %2, %3, %4, %5}], {%6, %7, %8};" + : + : "l"(gmem_int_desc), + "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n), + "h"(offset_w), "h"(offset_h), "h"(offset_d) + : "memory"); + #else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); + #endif + } + }; +}; + +struct SM90_TMA_LOAD_IM2COL +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n, + uint16_t const& offset_w) + { + return SM90_TMA_LOAD_IM2COL_3D::copy(desc_ptr, mbar_ptr, smem_ptr, + coord_c, coord_w, coord_n, + offset_w); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h) + { + return SM90_TMA_LOAD_IM2COL_4D::copy(desc_ptr, mbar_ptr, smem_ptr, + coord_c, coord_w, coord_h, coord_n, + offset_w, offset_h); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d) + { + return SM90_TMA_LOAD_IM2COL_5D::copy(desc_ptr, mbar_ptr, smem_ptr, + coord_c, coord_w, coord_h, coord_d, coord_n, + offset_w, offset_h, offset_d); + } + + struct PREFETCH + { + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n, + uint16_t const& offset_w) + { + return SM90_TMA_LOAD_IM2COL_3D::PREFETCH::copy(desc_ptr, + coord_c, coord_w, coord_n, + offset_w); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h) + { + return SM90_TMA_LOAD_IM2COL_4D::PREFETCH::copy(desc_ptr, + coord_c, coord_w, coord_h, coord_n, + offset_w, offset_h); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d) + { + return SM90_TMA_LOAD_IM2COL_5D::PREFETCH::copy(desc_ptr, + coord_c, coord_w, coord_h, coord_d, coord_n, + offset_w, offset_h, offset_d); + } + }; +}; + 
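//
// Usage sketch (illustrative only; not part of the original header). It shows how the
// SM90_TMA_LOAD_IM2COL dispatcher above selects the 3-D variant from the argument count.
// `desc` is assumed to be a valid im2col TMA descriptor, `mbar` an already-initialized
// shared-memory mbarrier whose expected transaction bytes have been set, and `smem_dst`
// a suitably aligned shared-memory staging buffer; that setup is outside this sketch.
//
inline __device__ void
example_issue_im2col_load(void const* desc, uint64_t* mbar, void* smem_dst,
                          int32_t c0, int32_t w0, int32_t n0, uint16_t offset_w)
{
  // A single thread issues the bulk tensor copy; completion is signaled on `mbar`.
  if (threadIdx.x == 0) {
    SM90_TMA_LOAD_IM2COL::copy(desc, mbar, smem_dst, c0, w0, n0, offset_w);
    // The prefetch form warms L2 only (no shared-memory destination, no mbarrier);
    // it would typically target a later tile's coordinates, reused here for brevity.
    SM90_TMA_LOAD_IM2COL::PREFETCH::copy(desc, c0, w0, n0, offset_w);
  }
}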
+//////////////////////////////////////////////////////////////////////////////////////////////////// +/// TMA_LOAD_MULTICAST: Initiates a TMA copy from global memory to shared memory +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_LOAD_MULTICAST_1D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint" + " [%0], [%1, {%4}], [%2], %3, %5;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "h"(multicast_mask), + "r"(crd0), "l"(cache_hint) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_LOAD_MULTICAST_2D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint" + " [%0], [%1, {%4, %5}], [%2], %3, %6;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "h"(multicast_mask), + "r"(crd0), "r"(crd1), "l"(cache_hint) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_LOAD_MULTICAST_3D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint" + " [%0], [%1, {%4, %5, %6}], [%2], %3, %7;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "h"(multicast_mask), + "r"(crd0), "r"(crd1), "r"(crd2), "l"(cache_hint) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_LOAD_MULTICAST_4D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = 
reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint" + " [%0], [%1, {%4, %5, %6, %7}], [%2], %3, %8;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "h"(multicast_mask), + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "l"(cache_hint) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_LOAD_MULTICAST_5D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint" + " [%0], [%1, {%4, %5, %6, %7, %8}], [%2], %3, %9;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "h"(multicast_mask), + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4), "l"(cache_hint) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_LOAD_MULTICAST +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0) + { + return SM90_TMA_LOAD_MULTICAST_1D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1) + { + return SM90_TMA_LOAD_MULTICAST_2D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2) + { + return SM90_TMA_LOAD_MULTICAST_3D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1, crd2); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3) + { + return SM90_TMA_LOAD_MULTICAST_4D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1, crd2, crd3); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint, + void * smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4) + { + return SM90_TMA_LOAD_MULTICAST_5D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1, crd2, crd3, crd4); + } + + using PREFETCH = typename SM90_TMA_LOAD::PREFETCH; +}; 
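//
// Usage sketch (illustrative only; not part of the original header): a 2-D multicast
// load issued from one CTA of a cluster. `desc` is assumed to be a valid TMA
// descriptor, `mbar` an initialized shared-memory mbarrier, `cta_mask` a bitmask with
// one bit set per receiving CTA rank in the cluster, and `cache_hint` the L2
// cache-policy word forwarded to the .L2::cache_hint qualifier.
//
inline __device__ void
example_issue_multicast_load(void const* desc, uint64_t* mbar,
                             uint16_t cta_mask, uint64_t cache_hint,
                             void* smem_dst, int32_t m0, int32_t n0)
{
  // A single thread issues the copy; the same 2-D tile is delivered to every CTA
  // selected by `cta_mask`, with completion reported through the mbarrier.
  if (threadIdx.x == 0) {
    SM90_TMA_LOAD_MULTICAST::copy(desc, mbar, cta_mask, cache_hint, smem_dst, m0, n0);
  }
}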
+ +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// TMA_LOAD_MULTICAST im2col: Initiates a TMA copy, in im2col mode, from global memory to shared memory +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_LOAD_IM2COL_MULTICAST_3D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n, + uint16_t const& offset_w) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + // Copy from global to shared::cluster. + asm volatile ( + "cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster" + " [%0], [%1, {%3, %4, %5}], [%2], {%6}, %7;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "r"(coord_c), "r"(coord_w), "r"(coord_n), + "h"(offset_w), + "h"(multicast_mask) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_LOAD_IM2COL_MULTICAST_4D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + // Copy from global to shared::cluster. + asm volatile ( + "cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster" + " [%0], [%1, {%3, %4, %5, %6}], [%2], {%7, %8}, %9;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n), + "h"(offset_w), "h"(offset_h), + "h"(multicast_mask) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_LOAD_IM2COL_MULTICAST_5D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr); + // Copy from global to shared::cluster. 
+ asm volatile ( + "cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster" + " [%0], [%1, {%3, %4, %5, %6, %7}], [%2], {%8, %9, %10}, %11;" + : + : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), + "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n), + "h"(offset_w), "h"(offset_h), "h"(offset_d), + "h"(multicast_mask) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_LOAD_IM2COL_MULTICAST +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n, + uint16_t const& offset_w) + { + return SM90_TMA_LOAD_IM2COL_MULTICAST_3D::copy(desc_ptr, mbar_ptr, multicast_mask, + smem_ptr, + coord_c, coord_w, coord_n, + offset_w); + } + + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h) + { + return SM90_TMA_LOAD_IM2COL_MULTICAST_4D::copy(desc_ptr, mbar_ptr, multicast_mask, + smem_ptr, + coord_c, coord_w, coord_h, coord_n, + offset_w, offset_h); + } + + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, + void * smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n, + uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d) + { + return SM90_TMA_LOAD_IM2COL_MULTICAST_5D::copy(desc_ptr, mbar_ptr, multicast_mask, + smem_ptr, + coord_c, coord_w, coord_h, coord_d, coord_n, + offset_w, offset_h, offset_d); + } + + using PREFETCH = typename SM90_TMA_LOAD_IM2COL::PREFETCH; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// TMA_STORE : Initiates a TMA copy from shared memory to global memory +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_STORE_1D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& crd0) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [%0, {%2}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(crd0) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_STORE_2D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& crd0, int32_t const& crd1) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%0, {%2, %3}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(crd0), "r"(crd1) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without 
CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_STORE_3D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [%0, {%2, %3, %4}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(crd0), "r"(crd1), "r"(crd2) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_STORE_4D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_STORE_5D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5, %6}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_STORE +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& crd0) + { + return SM90_TMA_STORE_1D::copy(desc_ptr, smem_ptr, crd0); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& crd0, int32_t const& crd1) + { + return SM90_TMA_STORE_2D::copy(desc_ptr, smem_ptr, crd0, crd1); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2) + { + return SM90_TMA_STORE_3D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3) + { + return SM90_TMA_STORE_4D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4) + { + return SM90_TMA_STORE_5D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3, crd4); + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// +/// TMA_STORE im2col: Initiates a TMA copy, in im2col mode, from shared memory to global memory +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_STORE_IM2COL_3D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group" + " [%0, {%2, %3, %4}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(coord_c), "r"(coord_w), "r"(coord_n) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_STORE_IM2COL_4D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group" + " [%0, {%2, %3, %4, %5}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_STORE_IM2COL_5D +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group" + " [%0, {%2, %3, %4, %5, %6}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_STORE_IM2COL +{ + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n) + { + return SM90_TMA_STORE_IM2COL_3D::copy(desc_ptr, smem_ptr, coord_c, coord_w, coord_n); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n) + { + return SM90_TMA_STORE_IM2COL_4D::copy(desc_ptr, smem_ptr, coord_c, coord_w, coord_h, coord_n); + } + CUTE_HOST_DEVICE static void + copy(void const* desc_ptr, + void const* smem_ptr, + int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n) + { + return 
SM90_TMA_STORE_IM2COL_5D::copy(desc_ptr, smem_ptr, coord_c, coord_w, coord_h, coord_d, coord_n); + } +}; + +// Fence for smem stores for subsequent TMA_STORE +CUTE_HOST_DEVICE static void +tma_store_fence() { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + cutlass::arch::synclog_emit_fence_view_async_shared(__LINE__); + asm volatile ("fence.proxy.async.shared::cta;"); +#elif defined(__CUDA_ARCH__) + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif +} + +// Indicate arrival of warp issuing TMA_STORE +CUTE_HOST_DEVICE static void +tma_store_arrive() { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + cutlass::arch::synclog_emit_tma_store_arrive(__LINE__); + asm volatile("cp.async.bulk.commit_group;"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif +} + +// Wait until at most Count committed TMA_STOREs are pending and all prior commits are complete +template +CUTE_HOST_DEVICE static void +tma_store_wait() { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + asm volatile( + "cp.async.bulk.wait_group.read %0;" + : + : "n"(Count) + : "memory"); + cutlass::arch::synclog_emit_tma_store_wait(__LINE__, Count); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// TMA_REDUCE_ADD : Initiates a TMA reduce-add from shared memory to global memory +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_REDUCE_ADD_1D +{ + CUTE_HOST_DEVICE static void + copy(void const* const desc_ptr, + void const* const smem_ptr, + int32_t const& crd0) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.bulk_group [%0, {%2}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(crd0) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_REDUCE_ADD_2D +{ + CUTE_HOST_DEVICE static void + copy(void const* const desc_ptr, + void const* const smem_ptr, + int32_t const& crd0, int32_t const& crd1) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.bulk_group [%0, {%2, %3}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(crd0), "r"(crd1) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_REDUCE_ADD_3D +{ + CUTE_HOST_DEVICE static void + copy(void const* const desc_ptr, + void const* const smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.bulk_group [%0, {%2, %3, %4}], [%1];" + : + : 
"l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(crd0), "r"(crd1), "r"(crd2) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_REDUCE_ADD_4D +{ + CUTE_HOST_DEVICE static void + copy(void const* const desc_ptr, + void const* const smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.bulk_group [%0, {%2, %3, %4, %5}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_REDUCE_ADD_5D +{ + CUTE_HOST_DEVICE static void + copy(void const* const desc_ptr, + void const* const smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr); + asm volatile ( + "cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.bulk_group [%0, {%2, %3, %4, %5, %6}], [%1];" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_TMA_REDUCE_ADD +{ + CUTE_HOST_DEVICE static void + copy(void const* const desc_ptr, + void const* const smem_ptr, + int32_t const& crd0) + { + return SM90_TMA_REDUCE_ADD_1D::copy(desc_ptr, smem_ptr, crd0); + } + CUTE_HOST_DEVICE static void + copy(void const* const desc_ptr, + void const* const smem_ptr, + int32_t const& crd0, int32_t const& crd1) + { + return SM90_TMA_REDUCE_ADD_2D::copy(desc_ptr, smem_ptr, crd0, crd1); + } + CUTE_HOST_DEVICE static void + copy(void const* const desc_ptr, + void const* const smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2) + { + return SM90_TMA_REDUCE_ADD_3D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2); + } + CUTE_HOST_DEVICE static void + copy(void const* const desc_ptr, + void const* const smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3) + { + return SM90_TMA_REDUCE_ADD_4D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3); + } + CUTE_HOST_DEVICE static void + copy(void const* const desc_ptr, + void const* const smem_ptr, + int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4) + { + return SM90_TMA_REDUCE_ADD_5D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3, crd4); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// BULK_COPY : Copy a bulk of memory between shared memory and global memory +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM90_BULK_COPY_G2S +{ + CUTE_HOST_DEVICE static void + copy(void const* gmem_ptr, uint64_t* mbar_ptr, + void * smem_ptr, int32_t load_bytes) + { +#if 
defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr); + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + asm volatile("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n" + : + : "r"(smem_int_ptr), "l"(gmem_ptr), "r"(load_bytes), "r"(smem_int_mbar) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use BULK_COPY without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } + + struct PREFETCH + { + CUTE_HOST_DEVICE static void + copy(void const* gmem_ptr, int32_t load_bytes) + { + #if defined(CUTE_ARCH_TMA_SM90_ENABLED) + asm volatile("cp.async.bulk.prefetch.L2.global [%0], %1;\n" + : + : "l"(gmem_ptr), "r"(load_bytes) + : "memory"); + #else + CUTE_INVALID_CONTROL_PATH("Trying to use BULK_COPY without CUTE_ARCH_TMA_SM90_ENABLED."); + #endif + } + }; +}; + +struct SM90_BULK_COPY_S2G +{ + CUTE_HOST_DEVICE static void + copy(void const* smem_ptr, + void * gmem_ptr, int32_t store_bytes) + { +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr); + asm volatile("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;\n" + : + : "l"(gmem_ptr), "r"(smem_int_ptr), "r"(store_bytes) + : "memory"); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use BULK_COPY without CUTE_ARCH_TMA_SM90_ENABLED."); +#endif + } +}; + +struct SM90_BULK_COPY_AUTO {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute diff --git a/include/cute/arch/mma.hpp b/include/cute/arch/mma.hpp new file mode 100644 index 0000000000..6e06114a6c --- /dev/null +++ b/include/cute/arch/mma.hpp @@ -0,0 +1,64 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::fma +#include // cute::fma + +namespace cute +{ + +// +// Direct FMA for any type +// + +template +struct UniversalFMA +{ + using DRegisters = D[1]; + using ARegisters = A[1]; + using BRegisters = B[1]; + using CRegisters = C[1]; + + CUTE_HOST_DEVICE static constexpr void + fma(D & d, + A const& a, + B const& b, + C const& c) + { + // Forward to an ADL/cute free function for these types + using cute::fma; + fma(d, a, b, c); + } +}; + +} // end namespace cute diff --git a/include/cute/arch/mma_sm61.hpp b/include/cute/arch/mma_sm61.hpp new file mode 100644 index 0000000000..f7bcb7d19d --- /dev/null +++ b/include/cute/arch/mma_sm61.hpp @@ -0,0 +1,87 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include +#include + +// Config +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610)) +# define CUTE_ARCH_MMA_SM61_ENABLED +#endif + +namespace cute +{ + +struct SM61_DP4A +{ + using DRegisters = int32_t[1]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = int32_t[1]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(int32_t& d, uint32_t const& a, uint32_t const& b, int32_t const& c) + { +#if defined(CUTE_ARCH_MMA_SM61_ENABLED) + asm volatile("dp4a.s32.s32 %0, %1, %2, %3;" + : "=r"(d) + : "r"(a), "r"(b), "r"(c)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM61_DP4A without CUTE_ARCH_MMA_SM61_ENABLED"); +#endif + } +}; + +struct SM61_DP2A +{ + using DRegisters = int32_t[1]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = int32_t[1]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(int32_t& d, uint32_t const& a, uint32_t const& b, int32_t const& c) + { +#if defined(CUTE_ARCH_MMA_SM61_ENABLED) + asm volatile("dp2a.s32.s32 %0, %1, %2, %3;" + : "=r"(d) + : "r"(a), "r"(b), "r"(c)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM61_DP2A without CUTE_ARCH_MMA_SM61_ENABLED"); +#endif + } +}; + +} // namespace cute diff --git a/include/cute/arch/mma_sm70.hpp b/include/cute/arch/mma_sm70.hpp new file mode 100644 index 0000000000..63d96cf5d6 --- /dev/null +++ b/include/cute/arch/mma_sm70.hpp @@ -0,0 +1,329 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include + +// Config +#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1)) +# define CUTE_ARCH_MMA_SM70_SUPPORTED +# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)) +# define CUTE_ARCH_MMA_SM70_ENABLED +# endif +#endif + +namespace cute +{ + +// +// SM70 MMA 884 F16F16F16 +// + +struct SM70_8x8x4_F16F16F16F16_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16" + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6, %7}," + "{%8, %9, %10, %11};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_TN without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F16F16F16F16_NT +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.col.row.f16.f16.f16.f16" + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6, %7}," + "{%8, %9, %10, %11};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_NT without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F16F16F16F16_NN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.col.col.f16.f16.f16.f16" + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6, %7}," + "{%8, %9, %10, %11};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_NN without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F16F16F16F16_TT +{ + using 
DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16" + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6, %7}," + "{%8, %9, %10, %11};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_TT without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// SM70 MMA 884 F16F16F32 +// + +struct SM70_8x8x4_F32F16F16F32_TN +{ + using DRegisters = float[8]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = float[8]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + float const& c0, float const& c1, float const& c2, float const& c3, + float const& c4, float const& c5, float const& c6, float const& c7) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11}," + "{%12, %13, %14, %15, %16, %17, %18, %19};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3), + "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3), + "f"(c4), "f"(c5), "f"(c6), "f"(c7)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_TN without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F32F16F16F32_NT +{ + using DRegisters = float[8]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = float[8]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + float const& c0, float const& c1, float const& c2, float const& c3, + float const& c4, float const& c5, float const& c6, float const& c7) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11}," + "{%12, %13, %14, %15, %16, %17, %18, %19};" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3), + "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3), + "f"(c4), "f"(c5), "f"(c6), "f"(c7)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_NT without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F32F16F16F32_NN +{ + using DRegisters = float[8]; + using ARegisters = uint32_t[2]; + 
using BRegisters = uint32_t[2]; + using CRegisters = float[8]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + float const& c0, float const& c1, float const& c2, float const& c3, + float const& c4, float const& c5, float const& c6, float const& c7) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.col.col.f32.f16.f16.f32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11}," + "{%12, %13, %14, %15, %16, %17, %18, %19};" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3), + "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3), + "f"(c4), "f"(c5), "f"(c6), "f"(c7)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_NN without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F32F16F16F32_TT +{ + using DRegisters = float[8]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = float[8]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + float const& c0, float const& c1, float const& c2, float const& c3, + float const& c4, float const& c5, float const& c6, float const& c7) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11}," + "{%12, %13, %14, %15, %16, %17, %18, %19};" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3), + "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3), + "f"(c4), "f"(c5), "f"(c6), "f"(c7)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_TT without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute diff --git a/include/cute/arch/mma_sm75.hpp b/include/cute/arch/mma_sm75.hpp new file mode 100644 index 0000000000..c33f7b391c --- /dev/null +++ b/include/cute/arch/mma_sm75.hpp @@ -0,0 +1,120 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +// Config +#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) +# define CUTE_ARCH_MMA_SM75_SUPPORTED +# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)) +# define CUTE_ARCH_MMA_SM75_ENABLED +# endif +#endif + +namespace cute +{ + +// +// SM75 MMA 1688 F16F16F32 +// + +struct SM75_16x8x8_F32F16F16F32_TN +{ + using DRegisters = float[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = float[4]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + float const& c0, float const& c1, float const& c2, float const& c3) + { +#if defined(CUTE_ARCH_MMA_SM75_ENABLED) + asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM75_16x8x8_F32F16F16F32_TN without CUTE_ARCH_MMA_SM75_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// SM75 MMA 8816 S8S8S32 +// + +struct SM75_8x8x16_S32S8S8S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM75_ENABLED) + asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32" + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM75_8x8x16_S32S8S8S32_TN without CUTE_ARCH_MMA_SM75_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute diff --git a/include/cute/arch/mma_sm80.hpp b/include/cute/arch/mma_sm80.hpp new file mode 100644 index 0000000000..17860dd40f --- /dev/null +++ b/include/cute/arch/mma_sm80.hpp @@ -0,0 +1,2241 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include +#include +#include + +// Config +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) +# define CUTE_ARCH_MMA_SM80_ENABLED + +#if (__CUDA_ARCH__ <= 900) +#define CUTE_ARCH_MMA_B1_AND_SM80_ENABLED +#endif + +#if (__CUDA_ARCH__ <= 890) +#define CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED +#endif + +#endif + + + +namespace cute { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x8 TN +struct SM80_16x8x8_F16F16F16F16_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " + "{%0, %1}," + "{%2, %3}," + "{%4}," + "{%5, %6};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F16F16F16F16_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_F16F16F16F16_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " + "{%0, %1}," + "{%2, 
%3, %4, %5}," + "{%6, %7}," + "{%8, %9};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_F16F16F16F16_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x8 TN +struct SM80_16x8x8_F32F16F16F32_TN +{ + using DRegisters = float[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + float const & c0, float const & c1, float const & c2, float const & c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F32F16F16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_F32F16F16F32_TN +{ + using DRegisters = float[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + float const & c0, float const & c1, float const & c2, float const & c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_F32F16F16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x8 TN +struct SM80_16x8x8_F32BF16BF16F32_TN +{ + using DRegisters = float[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + float const & c0, float const & c1, float const & c2, float const & c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F32BF16BF16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_F32BF16BF16F32_TN +{ + using DRegisters = float[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = 
float[4]; + + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + float const & c0, float const & c1, float const & c2, float const & c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_F32BF16BF16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x4 TN +struct SM80_16x8x4_F32TF32TF32F32_TN +{ + using DRegisters = float[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + float const & c0, float const & c1, float const & c2, float const & c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x4_F32TF32TF32F32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x8 TN +struct SM80_16x8x8_F32TF32TF32F32_TN +{ + using DRegisters = float[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + float const & c0, float const & c1, float const & c2, float const & c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F32TF32TF32F32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x4 TN +struct SM80_8x8x4_F64F64F64F64_TN +{ + using DRegisters = double[2]; + using ARegisters = double[1]; + using BRegisters = double[1]; + using CRegisters = double[2]; + + CUTE_HOST_DEVICE static void + fma(double & d0, double & d1, + double const& a0, + double const& b0, + double const& c0, double const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=d"(d0), "=d"(d1) + : "d"(a0), + "d"(b0), + "d"(c0), "d"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x4_F64F64F64F64_TN 
without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +// MMA 8x8x4 TN with Planar Complex multiplication +struct SM80_8x8x4_C64C64C64C64_TN +{ + using DRegisters = complex<double>[2]; + using ARegisters = complex<double>[1]; + using BRegisters = complex<double>[1]; + using CRegisters = complex<double>[2]; + + CUTE_HOST_DEVICE static void + fma(complex<double> & d0, complex<double> & d1, + complex<double> const& a0, + complex<double> const& b0, + complex<double> const& c0, complex<double> const& c1) + { + // Because thrust::complex does not provide a mutable ref + double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0]; + double& id0 = reinterpret_cast<double(&)[2]>(d0)[1]; + double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0]; + double& id1 = reinterpret_cast<double(&)[2]>(d1)[1]; + + // d.real() = a.real() * b.real() + c.real(); + SM80_8x8x4_F64F64F64F64_TN::fma( + rd0, rd1, + a0.real(), + b0.real(), + c0.real(), c1.real()); + + // d.imag() = a.imag() * b.real() + c.imag(); + SM80_8x8x4_F64F64F64F64_TN::fma( + id0, id1, + a0.imag(), + b0.real(), + c0.imag(), c1.imag()); + + // d.real() = -a.imag() * b.imag() + d.real(); + SM80_8x8x4_F64F64F64F64_TN::fma( + rd0, rd1, + -a0.imag(), + b0.imag(), + d0.real(), d1.real()); + + // d.imag() = a.real() * b.imag() + d.imag(); + SM80_8x8x4_F64F64F64F64_TN::fma( + id0, id1, + a0.real(), + b0.imag(), + d0.imag(), d1.imag()); + } +}; + +// MMA 8x8x4 TN with Gaussian Complex multiplication: +// (a + bi)*(c + di) +// yields +// t0 += a*c +// t1 += b*d +// t2 += (a+b)*(c+d) +// then +// re = t0 - t1 +// im = t2 - t0 - t1 +struct SM80_8x8x4_GC64C64C64GC64_TN +{ + struct GaussComplex { + double t0, t1, t2; + + CUTE_HOST_DEVICE //constexpr + operator complex<double>() const { return complex<double>(t0 - t1, t2 - t0 - t1); } + + CUTE_HOST_DEVICE friend //constexpr + complex<double> operator*(GaussComplex const& a, complex<double> const& b) { return static_cast<complex<double>>(a) * b; } + CUTE_HOST_DEVICE friend //constexpr + complex<double> operator*(complex<double> const& a, GaussComplex const& b) { return b * a; } + + CUTE_HOST_DEVICE friend //constexpr + complex<double> operator+(GaussComplex const& a, complex<double> const& b) { return static_cast<complex<double>>(a) + b; } + CUTE_HOST_DEVICE friend //constexpr + complex<double> operator+(complex<double> const& a, GaussComplex const& b) { return b + a; } + }; + + using DRegisters = GaussComplex[2]; + using ARegisters = complex<double>[1]; + using BRegisters = complex<double>[1]; + using CRegisters = GaussComplex[2]; + + CUTE_HOST_DEVICE static void + fma(GaussComplex & d0, GaussComplex & d1, + complex<double> const& a0, + complex<double> const& b0, + GaussComplex const& c0, GaussComplex const& c1) + { + SM80_8x8x4_F64F64F64F64_TN::fma(d0.t0, d1.t0, + a0.real(), + b0.real(), + c0.t0, c1.t0); + SM80_8x8x4_F64F64F64F64_TN::fma(d0.t1, d1.t1, + a0.imag(), + b0.imag(), + c0.t1, c1.t1); + SM80_8x8x4_F64F64F64F64_TN::fma(d0.t2, d1.t2, + a0.real() + a0.imag(), + b0.real() + b0.imag(), + c0.t2, c1.t2); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32S8S8S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8S8S32_TN without 
CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32S8S8S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32S8S8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32S8S8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S8S8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + 
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S8S8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32S8U8S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.u8.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32S8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.u8.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32S8U8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, 
uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32S8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S8U8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), 
"=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32U8S8S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.u8.s8.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32U8S8S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.u8.s8.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32U8S8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32U8S8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + 
"mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U8S8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U8S8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32U8U8S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.u8.u8.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct 
SM80_8x8x16_S32U8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.u8.u8.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32U8U8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32U8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U8U8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : 
"r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32S4S4S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32S4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S4S4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if 
defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s4.s4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s4.s4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32S4S4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32S4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM80_16x8x64_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32S4U4S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32S4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S4U4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s4.u4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s4.u4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), 
+ "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32S4U4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32S4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32U4S4S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.u4.s4.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32U4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, 
uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.u4.s4.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U4S4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u4.s4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u4.s4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32U4S4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + 
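Each of these SM80 atoms is a thin wrapper around a single `mma.sync` PTX instruction: the `DRegisters`/`ARegisters`/`BRegisters`/`CRegisters` aliases record the per-thread register footprint of each operand, and `fma` issues the instruction. As a rough sketch of direct use (the helper name and register arrays below are illustrative, not part of CUTLASS), a warp whose threads already hold their fragments in the layout required by the PTX `m16n8k64` shape could accumulate in place as follows; in practice the op is normally wrapped in a `cute::MMA_Atom`/`TiledMMA` so that CuTe derives those fragment layouts for you.

#include <cute/arch/mma_sm80.hpp>

// Illustrative sketch only: issue one u4 x s4 -> s32 m16n8k64 MMA, accumulating in place.
// Every thread of the warp supplies its own packed fragment registers; the
// thread-to-fragment mapping is the one fixed by the PTX mma instruction.
__device__ void warp_mma_u4s4(uint32_t const (&a)[4],   // packed unsigned 4-bit A fragment
                              uint32_t const (&b)[2],   // packed signed 4-bit B fragment
                              uint32_t       (&acc)[4]) // s32 accumulators (held in uint32_t registers, as the atom expects)
{
  cute::SM80_16x8x64_S32U4S4S32_TN::fma(
      acc[0], acc[1], acc[2], acc[3],   // D
      a[0],   a[1],   a[2],   a[3],     // A
      b[0],   b[1],                     // B
      acc[0], acc[1], acc[2], acc[3]);  // C: reuse the D registers to accumulate
}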
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32U4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32U4U4S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.u4.u4.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32U4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.u4.u4.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U4U4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u4.u4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), 
"r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u4.u4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32U4U4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32U4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x128 TN +struct 
SM80_8x8x128_S32U1U1S32_TN_ANDPOPC +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32.and.popc " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x128_S32U1U1S32_TN_ANDPOPC without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x128 TN +struct SM80_16x8x128_S32U1U1S32_TN_ANDPOPC +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k128.row.col.s32.b1.b1.s32.and.popc " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x128_S32U1U1S32_TN_ANDPOPC without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x256 TN +struct SM80_16x8x256_S32U1U1S32_TN_ANDPOPC +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_B1_AND_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x256_S32U1U1S32_TN_ANDPOPC without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x128 TN +struct SM80_8x8x128_S32U1U1S32_TN_XORPOPC +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32.xor.popc " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else 
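For the single-bit (`b1`) atoms, the "multiply" is a bitwise operation and the reduction is a population count: the `.and.popc` forms accumulate `popc(a & b)` and the `.xor.popc` forms accumulate `popc(a ^ b)` into `s32` accumulators, with 32 one-bit lanes packed per `uint32_t`. A scalar reference of the per-element semantics, as a sketch only (names are illustrative):

    #include <bit>
    #include <cstdint>

    // One 32-lane slice of the K reduction for a single accumulator element.
    int32_t b1_popc_step(uint32_t a, uint32_t b, int32_t c, bool use_xor)
    {
      uint32_t prod = use_xor ? (a ^ b) : (a & b);  // .xor.popc vs .and.popc
      return c + std::popcount(prod);
    }

Note also that some of these variants are guarded by dedicated macros (`CUTE_ARCH_MMA_B1_AND_SM80_ENABLED`, `CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED`) rather than the generic `CUTE_ARCH_MMA_SM80_ENABLED`.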
+ CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x128_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x128 TN +struct SM80_16x8x128_S32U1U1S32_TN_XORPOPC +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k128.row.col.s32.b1.b1.s32.xor.popc " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x128_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x256 TN +struct SM80_16x8x256_S32U1U1S32_TN_XORPOPC +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x256_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cute diff --git a/include/cute/arch/mma_sm90.hpp b/include/cute/arch/mma_sm90.hpp new file mode 100644 index 0000000000..51d34563c4 --- /dev/null +++ b/include/cute/arch/mma_sm90.hpp @@ -0,0 +1,9331 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include +#include + +// Config +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +# define CUTE_ARCH_MMA_SM90_ENABLED +# define CUTE_ARCH_MMA_F64_SM90_ENABLED +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cute { + +namespace SM90 { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x4 TN +struct MMA_16x8x4_F64F64F64F64_TN +{ + using DRegisters = double[4]; + using ARegisters = double[2]; + using BRegisters = double[1]; + using CRegisters = double[4]; + + CUTE_HOST_DEVICE static void + fma(double & d0, double & d1, double & d2, double & d3, + double const& a0, double const& a1, + double const& b0, + double const& c0, double const& c1, double const& c2, double const& c3) + { +#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64" + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3) + : "d"(a0), "d"(a1), + "d"(b0), + "d"(c0), "d"(c1), "d"(c2), "d"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_16x8x4_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x8 TN +struct MMA_16x8x8_F64F64F64F64_TN +{ + using DRegisters = double[4]; + using ARegisters = double[4]; + using BRegisters = double[2]; + using CRegisters = double[4]; + + CUTE_HOST_DEVICE static void + fma(double & d0, double & d1, double & d2, double & d3, + double const& a0, double const& a1, double const& a2, double const& a3, + double const& b0, double const& b1, + double const& c0, double const& c1, double const& c2, double const& c3) + { +#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f64.f64.f64.f64" + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3) + : "d"(a0), "d"(a1), "d"(a2), "d"(a3), + "d"(b0), "d"(b1), + "d"(c0), "d"(c1), "d"(c2), "d"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_16x8x8_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct MMA_16x8x16_F64F64F64F64_TN +{ + using DRegisters = double[4]; + using ARegisters = double[8]; + using BRegisters = double[4]; + using CRegisters = double[4]; + + CUTE_HOST_DEVICE 
static void + fma(double & d0, double & d1, double & d2, double & d3, + double const& a0, double const& a1, double const& a2, double const& a3, + double const& a4, double const& a5, double const& a6, double const& a7, + double const& b0, double const& b1, double const& b2, double const& b3, + double const& c0, double const& c1, double const& c2, double const& c3) + { +#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f64.f64.f64.f64" + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7, %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + "{%16, %17, %18, %19};\n" + : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3) + : "d"(a0), "d"(a1), "d"(a2), "d"(a3), + "d"(a4), "d"(a5), "d"(a6), "d"(a7), + "d"(b0), "d"(b1), "d"(b2), "d"(b3), + "d"(c0), "d"(c1), "d"(c2), "d"(c3)); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_16x8x16_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x4 TN +struct MMA_16x8x4_C64C64C64C64_TN +{ + using DRegisters = complex[4]; + using ARegisters = complex[2]; + using BRegisters = complex[1]; + using CRegisters = complex[4]; + + CUTE_HOST_DEVICE static void + fma(complex & d0, complex & d1, + complex & d2, complex & d3, + complex const& a0, complex const& a1, + complex const& b0, + complex const& c0, complex const& c1, + complex const& c2, complex const& c3) + { + // Because thrust::complex does not provide a mutable ref + double& rd0 = reinterpret_cast(d0)[0]; + double& id0 = reinterpret_cast(d0)[1]; + double& rd1 = reinterpret_cast(d1)[0]; + double& id1 = reinterpret_cast(d1)[1]; + double& rd2 = reinterpret_cast(d2)[0]; + double& id2 = reinterpret_cast(d2)[1]; + double& rd3 = reinterpret_cast(d3)[0]; + double& id3 = reinterpret_cast(d3)[1]; + + // d.real() = a.real() * b.real() + c.real(); + MMA_16x8x4_F64F64F64F64_TN::fma( + rd0, rd1, rd2, rd3, + a0.real(), a1.real(), + b0.real(), + c0.real(), c1.real(), c2.real(), c3.real()); + + // d.imag() = a.imag() * b.real() + c.imag(); + MMA_16x8x4_F64F64F64F64_TN::fma( + id0, id1, id2, id3, + a0.imag(), a1.imag(), + b0.real(), + c0.imag(), c1.imag(), c2.imag(), c3.imag()); + + // d.real() = -a.imag() * b.imag() + d.real(); + MMA_16x8x4_F64F64F64F64_TN::fma( + rd0, rd1, rd2, rd3, + -a0.imag(), -a1.imag(), + b0.imag(), + d0.real(), d1.real(), d2.real(), d3.real()); + + // d.imag() = a.real() * b.imag() + d.imag(); + MMA_16x8x4_F64F64F64F64_TN::fma( + id0, id1, id2, id3, + a0.real(), a1.real(), + b0.imag(), + d0.imag(), d1.imag(), d2.imag(), d3.imag()); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x8 TN +struct MMA_16x8x8_C64C64C64C64_TN +{ + using DRegisters = complex[4]; + using ARegisters = complex[4]; + using BRegisters = complex[2]; + using CRegisters = complex[4]; + + CUTE_HOST_DEVICE static void + fma(complex & d0, complex & d1, + complex & d2, complex & d3, + complex const& a0, complex const& a1, + complex const& a2, complex const& a3, + complex const& b0, complex const& b1, + complex const& c0, complex const& c1, + complex const& c2, complex const& c3) + { + // Because thrust::complex does not provide a mutable ref + double& rd0 = reinterpret_cast(d0)[0]; + double& id0 = reinterpret_cast(d0)[1]; + double& rd1 = reinterpret_cast(d1)[0]; + double& id1 = reinterpret_cast(d1)[1]; + double& rd2 = reinterpret_cast(d2)[0]; + double& id2 = reinterpret_cast(d2)[1]; + double& rd3 = 
reinterpret_cast(d3)[0]; + double& id3 = reinterpret_cast(d3)[1]; + + // d.real() = a.real() * b.real() + c.real(); + MMA_16x8x8_F64F64F64F64_TN::fma( + rd0, rd1, rd2, rd3, + a0.real(), a1.real(), a2.real(), a3.real(), + b0.real(), b1.real(), + c0.real(), c1.real(), c2.real(), c3.real()); + + // d.imag() = a.imag() * b.real() + c.imag(); + MMA_16x8x8_F64F64F64F64_TN::fma( + id0, id1, id2, id3, + a0.imag(), a1.imag(), a2.imag(), a3.imag(), + b0.real(), b1.real(), + c0.imag(), c1.imag(), c2.imag(), c3.imag()); + + // d.real() = -a.imag() * b.imag() + d.real(); + MMA_16x8x8_F64F64F64F64_TN::fma( + rd0, rd1, rd2, rd3, + -a0.imag(), -a1.imag(), -a2.imag(), -a3.imag(), + b0.imag(), b1.imag(), + d0.real(), d1.real(), d2.real(), d3.real()); + + // d.imag() = a.real() * b.imag() + d.imag(); + MMA_16x8x8_F64F64F64F64_TN::fma( + id0, id1, id2, id3, + a0.real(), a1.real(), a2.real(), a3.real(), + b0.imag(), b1.imag(), + d0.imag(), d1.imag(), d2.imag(), d3.imag()); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct MMA_16x8x16_C64C64C64C64_TN +{ + using DRegisters = complex[4]; + using ARegisters = complex[8]; + using BRegisters = complex[4]; + using CRegisters = complex[4]; + + CUTE_HOST_DEVICE static void + fma(complex & d0, complex & d1, + complex & d2, complex & d3, + complex const& a0, complex const& a1, + complex const& a2, complex const& a3, + complex const& a4, complex const& a5, + complex const& a6, complex const& a7, + complex const& b0, complex const& b1, + complex const& b2, complex const& b3, + complex const& c0, complex const& c1, + complex const& c2, complex const& c3) + { + // Because thrust::complex does not provide a mutable ref + double& rd0 = reinterpret_cast(d0)[0]; + double& id0 = reinterpret_cast(d0)[1]; + double& rd1 = reinterpret_cast(d1)[0]; + double& id1 = reinterpret_cast(d1)[1]; + double& rd2 = reinterpret_cast(d2)[0]; + double& id2 = reinterpret_cast(d2)[1]; + double& rd3 = reinterpret_cast(d3)[0]; + double& id3 = reinterpret_cast(d3)[1]; + + // d.real() = a.real() * b.real() + c.real(); + MMA_16x8x16_F64F64F64F64_TN::fma( + rd0, rd1, rd2, rd3, + a0.real(), a1.real(), a2.real(), a3.real(), + a4.real(), a5.real(), a6.real(), a7.real(), + b0.real(), b1.real(), b2.real(), b3.real(), + c0.real(), c1.real(), c2.real(), c3.real()); + + // d.imag() = a.imag() * b.real() + c.imag(); + MMA_16x8x16_F64F64F64F64_TN::fma( + id0, id1, id2, id3, + a0.imag(), a1.imag(), a2.imag(), a3.imag(), + a4.imag(), a5.imag(), a6.imag(), a7.imag(), + b0.real(), b1.real(), b2.real(), b3.real(), + c0.imag(), c1.imag(), c2.imag(), c3.imag()); + + // d.real() = -a.imag() * b.imag() + d.real(); + MMA_16x8x16_F64F64F64F64_TN::fma( + rd0, rd1, rd2, rd3, + -a0.imag(), -a1.imag(), -a2.imag(), -a3.imag(), + -a4.imag(), -a5.imag(), -a6.imag(), -a7.imag(), + b0.imag(), b1.imag(), b2.imag(), b3.imag(), + d0.real(), d1.real(), d2.real(), d3.real()); + + // d.imag() = a.real() * b.imag() + d.imag(); + MMA_16x8x16_F64F64F64F64_TN::fma( + id0, id1, id2, id3, + a0.real(), a1.real(), a2.real(), a3.real(), + a4.real(), a5.real(), a6.real(), a7.real(), + b0.imag(), b1.imag(), b2.imag(), b3.imag(), + d0.imag(), d1.imag(), d2.imag(), d3.imag()); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} + +} // namespace cute + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include // 
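The complex-valued wrappers above contain no PTX of their own: each one expands `d = a*b + c` over complex doubles into four calls to the corresponding real `f64` MMA, so that every step stays in fused multiply-add form. Writing `a = ar + i*ai` (and likewise for b and c), the parts are built up as `dr = ar*br + cr`, `di = ai*br + ci`, then `dr = -ai*bi + dr` and `di = ar*bi + di`, which matches the four `fma` calls and explains the negated `a.imag()` operands in the third call. A scalar reference of the same decomposition, for one accumulator element (illustrative only):

    #include <complex>

    std::complex<double> complex_fma_reference(std::complex<double> a,
                                               std::complex<double> b,
                                               std::complex<double> c)
    {
      double dr = a.real() * b.real() + c.real();  // real MMA #1
      double di = a.imag() * b.real() + c.imag();  // real MMA #2
      dr = -a.imag() * b.imag() + dr;              // real MMA #3
      di =  a.real() * b.imag() + di;              // real MMA #4
      return {dr, di};                             // equals a*b + c
    }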
cute::size +#include // cute::is_static +#include // cute::half_t, cute::float_e4m3_t, cute::tfloat32_t, etc +#include // cute::is_same_v + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cute { +namespace SM90::GMMA { + +template < + class ElementA, + class ElementB, + class ElementC, + class TileShape_MNK, + GMMA::Major MajorA = GMMA::Major::K, + GMMA::Major MajorB = GMMA::Major::K, + auto... Args // e.g. GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One] + // But most commonly leave empty for defaults +> +CUTE_HOST_DEVICE constexpr +auto +ss_op_selector() +{ + static_assert(is_static::value, "TileShape_MNK must be static."); + static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3."); + static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64."); + auto Tile_N = size<1>(TileShape_MNK{}); + + // F16 accumulator + if constexpr (is_same_v) { + + // Input A: half_t ; Input B: half_t + if constexpr (is_same_v && is_same_v) { + static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x16_F16F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x16_F16F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x16_F16F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x16_F16F16F16_SS{}; + } +#endif + else if 
constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x16_F16F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x16_F16F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x16_F16F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x16_F16F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x16_F16F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x16_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x16_F16F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x16_F16F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x16_F16F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x16_F16F16F16_SS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x16_F16F16F16_SS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if 
constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + 
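Every branch of `ss_op_selector` follows the same dispatch pattern: walk the candidate instruction N sizes from largest to smallest and return the first GMMA op whose N divides `Tile_N`, so the tile's N extent is covered with as few instruction issues as possible. The shapes wrapped in `CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED` (248, 240, ..., 24) are only considered when that macro is defined; without it, a Tile_N that is not a multiple of one of {256, 192, 128, 96, 64, 32, 16, 8} falls through to a narrower op (for example, Tile_N = 144 resolves to the 64x16 shape, issued nine times across N). A small sketch of resolving an op at compile time, assuming the CuTe headers (e.g. `<cute/atom/mma_atom.hpp>`) are on the include path:

    // fp16 inputs, fp16 accumulation, 64x192x64 tile:
    // 192 is in the always-available set, so this resolves to a 64x192x16 SS op.
    using TileShape_MNK = cute::Shape<cute::_64, cute::_192, cute::_64>;
    using Op = decltype(cute::SM90::GMMA::ss_op_selector<
                          cute::half_t, cute::half_t, cute::half_t,
                          TileShape_MNK>());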
return SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e5m2_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if 
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return 
SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e5m2_t + else if constexpr 
(is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + 
return SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + else { + static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration."); + } + } + + // F32 accumulator + else if constexpr (is_same_v) { + + // Input A: half_t ; Input B: half_t + if constexpr (is_same_v && is_same_v) { + static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x16_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x16_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x16_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + 
return SM90::GMMA::MMA_64x168x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x16_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x16_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x16_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x16_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x16_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x16_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x16_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x16_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x16_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x16_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x16_F32F16F16_SS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x16_F32F16F16_SS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: bfloat16_t ; Input B: bfloat16_t + else if constexpr (is_same_v && is_same_v) { + static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x16_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + 
else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x16_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x16_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x16_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x16_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x16_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x16_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x16_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x16_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x16_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x16_F32BF16BF16_SS{}; + } +#endif +#if 
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x16_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x16_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x16_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x16_F32BF16BF16_SS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x16_F32BF16BF16_SS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: tfloat32_t ; Input B: tfloat32_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 8 == 0, "Tile_K must be a multiple of 8."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x8_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x8_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x8_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x8_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + 
return SM90::GMMA::MMA_64x128x8_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x8_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x8_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x8_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x8_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x8_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x8_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x8_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x8_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x8_F32TF32TF32_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x8_F32TF32TF32_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if 
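Note the constraints that differ between input types: the `half_t`/`bfloat16_t` paths accept any `MajorA`/`MajorB` (the defaults are `GMMA::Major::K`), whereas the `tfloat32_t` and FP8 paths `static_assert` that both operands are K-major, i.e. only the TN layout is supported, and they also require a different Tile_K multiple (8 for TF32 and 32 for FP8, versus 16 for FP16/BF16). For example (sketch only, same header assumptions as above):

    // tf32 inputs with fp32 accumulation, K-major only; a 64x128x32 tile
    // resolves to the 64x128x8 TN op.
    using TileShape_MNK = cute::Shape<cute::_64, cute::_128, cute::_32>;
    using Op = decltype(cute::SM90::GMMA::ss_op_selector<
                          cute::tfloat32_t, cute::tfloat32_t, float,
                          TileShape_MNK>());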
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_SS_TN{}; + } 
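Each type-pair branch above resolves the same way at compile time: the `else if constexpr` ladder walks candidate GMMA shapes from N = 256 downward and returns the first atom whose N-extent divides `Tile_N`; shapes outside the always-available set are wrapped in `#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)` and only participate when that macro is defined. The standalone sketch below (hypothetical helpers `is_base_shape` and `pick_gmma_n`, not part of this patch) mirrors that policy for the floating-point ladders, which step through every multiple of 8; the int8 ladders later in the file expose a sparser set of extended shapes.

```cpp
#include <cstdio>

// Candidate N-extents that the ladders above offer unconditionally.
constexpr bool is_base_shape(int n) {
  return n == 256 || n == 192 || n == 128 || n == 96 ||
         n == 64  || n == 32  || n == 16  || n == 8;
}

// First (largest) candidate that divides Tile_N wins, exactly as the
// else-if-constexpr chain resolves at compile time.
constexpr int pick_gmma_n(int tile_n, bool extended_shapes) {
  for (int n = 256; n >= 8; n -= 8) {
    if (!is_base_shape(n) && !extended_shapes) {
      continue;  // extended shapes need CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED
    }
    if (tile_n % n == 0) {
      return n;
    }
  }
  return 0;  // Tile_N is not a multiple of 8 -> the selector static_asserts
}

int main() {
  static_assert(pick_gmma_n(192, false) == 192, "base shape, always available");
  static_assert(pick_gmma_n(144, false) ==  16, "without extended shapes, falls back to 16");
  static_assert(pick_gmma_n(144, true)  == 144, "with extended shapes, 144 is picked");
  std::printf("Tile_N=208 -> N=%d\n", pick_gmma_n(208, true));
  return 0;
}
```

Compiled with any C++17 host compiler, the `static_assert`s document the fallback behaviour when the extended shapes are compiled out.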
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_SS_TN{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_SS_TN{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_SS_TN{};
+      }
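For orientation, this is how a dense selector of this kind is typically consumed by a mainloop or collective builder. The sketch assumes the upstream CUTLASS spelling `cute::GMMA::ss_op_selector` and the usual CuTe headers; the namespaces and header layout in this patch may differ, so treat it as illustrative rather than authoritative.

```cpp
#include <cstdio>
#include <cute/tensor.hpp>
#include <cute/atom/mma_atom.hpp>

using namespace cute;

// 128x128x64 tile: Tile_M is a multiple of 64 and Tile_K a multiple of 16,
// as the half_t x half_t -> float branch of the selector requires.
using TileShape_MNK = Shape<_128, _128, _64>;

// The selector returns the widest SS GMMA atom whose N-extent divides 128;
// make_tiled_mma wraps it into a warpgroup-wide TiledMMA.
using MmaAtom  = decltype(GMMA::ss_op_selector<half_t, half_t, float, TileShape_MNK>());
using TiledMma = decltype(make_tiled_mma(MmaAtom{}));

int main() {
  // A single Hopper warpgroup drives the GMMA, so the TiledMMA spans 128 threads.
  std::printf("TiledMMA threads: %d\n", int(size(TiledMma{})));
  return 0;
}
```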
+#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return 
SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e5m2_t + else if constexpr 
(is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + 
return SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + else { + static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration."); + } + } + + // S32 accumulator + else if constexpr (is_same_v) { + + // Input A: int8_t ; Input B: int8_t + if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_S32S8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_S32S8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_S32S8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_S32S8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_S32S8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_S32S8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_S32S8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_S32S8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_S32S8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_S32S8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_S32S8S8_SS_TN{}; + } +#if 
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_S32S8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_S32S8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_S32S8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_S32S8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_S32S8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_S32S8S8_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_S32S8S8_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: int8_t ; Input B: uint8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_S32S8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_S32S8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_S32S8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_S32S8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_S32S8U8_SS_TN{}; + } + else if 
constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_S32S8U8_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: uint8_t ; Input B: int8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_S32U8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_S32U8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_S32U8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_S32U8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_S32U8S8_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_S32U8S8_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: uint8_t ; Input B: uint8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return 
SM90::GMMA::MMA_64x240x32_S32U8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_S32U8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_S32U8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_S32U8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_S32U8U8_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_S32U8U8_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + else { + static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration."); + } + } + + // Unknown accumulator type + else { + static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type."); + } +} + +template < + class ElementA, + class ElementB, + class ElementC, + class TileShape_MNK, + GMMA::Major MajorA = GMMA::Major::K, + GMMA::Major MajorB = GMMA::Major::K, + auto... Args // e.g. 
GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
+                 // But most commonly leave empty for defaults
+>
+CUTE_HOST_DEVICE constexpr
+auto
+ss_op_selector_sparse()
+{
+  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
+  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
+  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
+  auto Tile_N = size<1>(TileShape_MNK{});
+
+  // F16 accumulator
+  if constexpr (is_same_v<ElementC, half_t>) {
+
+    // Input A: half_t ; Input B: half_t
+    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_SS{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_SS{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_SS{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_SS{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_SS{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_SS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr
(Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_SS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_SS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if 
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e5m2_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 
== 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e5m2_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 
0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + else { + static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration."); + } + } + + // F32 accumulator + else if constexpr (is_same_v) { + + // Input A: half_t ; Input B: half_t + if constexpr (is_same_v && is_same_v) { + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) 
{ + return SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 
40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_SS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_SS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_SS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: bfloat16_t ; Input B: bfloat16_t + else if constexpr (is_same_v && is_same_v) { + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + 
else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_SS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_SS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_SS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_SS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_SS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: tfloat32_t ; Input B: tfloat32_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_SS_TN{}; + } 
+#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { 
+ return SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N 
% 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e5m2_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) 
{ + return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e5m2_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if 
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_SS_TN{}; + } +#if 
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_SS_TN{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_SS_TN{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // S32 accumulator
+  else if constexpr (is_same_v<ElementC, int32_t>) {
+
+    // Input A: int8_t ; Input B: int8_t
+    if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, int8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N
% 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: int8_t ; Input B: uint8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN{}; + } + else { + static_assert(Tile_N % 8 
== 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: uint8_t ; Input B: int8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: uint8_t ; Input B: uint8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N 
% 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + else { + static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration."); + } + } + + // Unknown accumulator type + else { + static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type."); + } +} + +template < + class ElementA, + class ElementB, + class ElementC, + class TileShape_MNK, + GMMA::Major MajorA = GMMA::Major::K, + GMMA::Major MajorB = GMMA::Major::K, + auto... Args // e.g. 
GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
+  // But most commonly leave empty for defaults
+>
+CUTE_HOST_DEVICE constexpr
+auto
+rs_op_selector()
+{
+  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
+  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
+  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
+  static_assert(MajorA == GMMA::Major::K, "Register source A operand GMMAs must have K-major A layout.");
+  auto Tile_N = size<1>(TileShape_MNK{});
+
+  // F16 accumulator
+  if constexpr (is_same_v<ElementC, half_t>) {
+
+    // Input A: half_t ; Input B: half_t
+    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x16_F16F16F16_RS{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x16_F16F16F16_RS{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x16_F16F16F16_RS{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x16_F16F16F16_RS{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x16_F16F16F16_RS{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x16_F16F16F16_RS{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return
SM90::GMMA::MMA_64x112x16_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x16_F16F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x16_F16F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x16_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x16_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x16_F16F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x16_F16F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x16_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x16_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x16_F16F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x16_F16F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x16_F16F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x16_F16F16F16_RS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x16_F16F16F16_RS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return 
SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e5m2_t + else if constexpr 
(is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + 
return SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if 
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e5m2_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return 
SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return 
SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + else { + static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration."); + } + } + + // F32 accumulator + else if constexpr (is_same_v) { + + // Input A: half_t ; Input B: half_t + if constexpr (is_same_v && is_same_v) { + static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x16_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x16_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x16_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return 
SM90::GMMA::MMA_64x152x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x16_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x16_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x16_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x16_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x16_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x16_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x16_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x16_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x16_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x16_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x16_F32F16F16_RS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x16_F32F16F16_RS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: bfloat16_t ; Input B: bfloat16_t + else if constexpr (is_same_v && is_same_v) { + static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x16_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + 
else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x16_F32BF16BF16_RS{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x16_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x16_F32BF16BF16_RS{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x16_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x16_F32BF16BF16_RS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x16_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x16_F32BF16BF16_RS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x16_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x16_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x16_F32BF16BF16_RS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x16_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return 
SM90::GMMA::MMA_64x24x16_F32BF16BF16_RS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x16_F32BF16BF16_RS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x16_F32BF16BF16_RS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: tfloat32_t ; Input B: tfloat32_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 8 == 0, "Tile_K must be a multiple of 8."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x8_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x8_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x8_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x8_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x8_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return 
SM90::GMMA::MMA_64x112x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x8_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x8_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x8_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x8_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x8_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x8_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x8_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x8_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x8_F32TF32TF32_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x8_F32TF32TF32_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 
== 0) { + return SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e5m2_t + else 
if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 
88 == 0) { + return SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_RS_TN{}; + } 
+#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e5m2_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return 
SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return 
SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + else { + static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration."); + } + } + + // S32 accumulator + else if constexpr (is_same_v) { + + // Input A: int8_t ; Input B: int8_t + if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_S32S8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_S32S8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_S32S8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_S32S8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_S32S8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_S32S8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_S32S8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_S32S8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + 
return SM90::GMMA::MMA_64x48x32_S32S8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_S32S8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_S32S8S8_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_S32S8S8_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: int8_t ; Input B: uint8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_S32S8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_S32S8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_S32S8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_S32S8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_S32S8U8_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_S32S8U8_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: uint8_t ; Input B: int8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this 
config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_S32U8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_S32U8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_S32U8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_S32U8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_S32U8S8_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_S32U8S8_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: uint8_t ; Input B: uint8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::MMA_64x256x32_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::MMA_64x240x32_S32U8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::MMA_64x224x32_S32U8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::MMA_64x208x32_S32U8U8_RS_TN{}; + } 
+#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::MMA_64x192x32_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::MMA_64x176x32_S32U8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::MMA_64x160x32_S32U8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::MMA_64x144x32_S32U8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::MMA_64x128x32_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::MMA_64x112x32_S32U8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::MMA_64x96x32_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::MMA_64x80x32_S32U8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::MMA_64x64x32_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::MMA_64x48x32_S32U8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::MMA_64x32x32_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::MMA_64x24x32_S32U8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::MMA_64x16x32_S32U8U8_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::MMA_64x8x32_S32U8U8_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + else { + static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration."); + } + } + + // Unknown accumulator type + else { + static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type."); + } +} + +template < + class ElementA, + class ElementB, + class ElementC, + class TileShape_MNK, + GMMA::Major MajorA = GMMA::Major::K, + GMMA::Major MajorB = GMMA::Major::K, + auto... Args // e.g. 
GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One] + // But most commonly leave empty for defaults +> +CUTE_HOST_DEVICE constexpr +auto +rs_op_selector_sparse() +{ + static_assert(is_static::value, "TileShape_MNK must be static."); + static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3."); + static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64."); + static_assert(MajorA == GMMA::Major::K, "Register source A operand GMMAs must have K-major A layout."); + auto Tile_N = size<1>(TileShape_MNK{}); + + // F16 accumulator + if constexpr (is_same_v) { + + // Input A: half_t ; Input B: half_t + if constexpr (is_same_v && is_same_v) { + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_RS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_RS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if 
constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_RS_TN{}; + } 
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e5m2_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_RS_TN{}; + } +#endif 
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_RS_TN{}; + } +#endif +#if 
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e5m2_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_RS_TN{}; + } 
+#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + else { + static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration."); + } + } + + // F32 accumulator + else if constexpr (is_same_v) { + + // Input A: half_t ; Input B: half_t + if constexpr (is_same_v && is_same_v) { + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_RS{}; + } 
+#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_RS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_RS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_RS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: bfloat16_t ; Input B: bfloat16_t + else if constexpr (is_same_v && is_same_v) { + static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_RS{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_RS{}; + } +#endif + else if 
constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_RS{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_RS{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_RS{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_RS{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_RS{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_RS{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_RS{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_RS{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: tfloat32_t ; Input B: tfloat32_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_RS_TN{}; + } +#endif +#if 
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) 
+ else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if 
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e4m3_t ; Input B: float_e5m2_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e4m3_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { 
+ return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: float_e5m2_t ; Input B: float_e5m2_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 248 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if 
defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 232 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 216 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 200 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 184 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 168 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 152 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 136 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 120 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 104 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 88 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 72 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 56 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 40 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + else { + static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration."); + } + } + + // S32 accumulator + else if constexpr (is_same_v) { + + // Input A: int8_t ; Input B: int8_t + if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN{}; + } +#endif + 
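+      // Selection note: within each (ElementA, ElementB) combination this cascade walks Tile_N from
+      // 256 down to 8 and returns the widest sparse GMMA atom whose N divides Tile_N evenly; the
+      // shapes that are not multiples of 64 (240, 224, 208, 176, 160, 144, 112, 80, 48, 24) are only
+      // candidates when CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED is defined. For example, with the
+      // extended shapes enabled a Tile_N of 80 selects GMMA_64x80x64_S32S8S8_RS_TN; without them it
+      // falls through to GMMA_64x16x64_S32S8S8_RS_TN.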
else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: int8_t ; Input B: uint8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return 
SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: uint8_t ; Input B: int8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + // Input A: uint8_t ; Input B: uint8_t + else if constexpr (is_same_v && is_same_v) { + static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config."); + static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config."); + static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple 
of 64."); + + if constexpr (Tile_N % 256 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 240 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 224 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 208 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 192 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 176 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 160 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN{}; + } +#endif +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 144 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 128 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 112 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 96 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 80 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 64 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 48 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 32 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN{}; + } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) + else if constexpr (Tile_N % 24 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN{}; + } +#endif + else if constexpr (Tile_N % 16 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN{}; + } + else if constexpr (Tile_N % 8 == 0) { + return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN{}; + } + else { + static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8."); + } + } + + else { + static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration."); + } + } + + // Unknown accumulator type + else { + static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type."); + } +} + +} // end namespace SM90::GMMA +} // end namespace cute + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cute/arch/mma_sm90_desc.hpp b/include/cute/arch/mma_sm90_desc.hpp new file mode 100644 index 0000000000..a53a9748b4 --- /dev/null +++ b/include/cute/arch/mma_sm90_desc.hpp @@ -0,0 +1,156 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include + +#include + +// Config +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL)) +# define CUTE_ARCH_MMA_SM90A_ENABLED +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cute { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// GMMA Descriptor and utilities + +// GMMA enums and utilities +namespace SM90::GMMA { + +enum class LayoutType : uint8_t { + INTERLEAVE = 0, + B128 = 1, + B64 = 2, + B32 = 3, +}; + +CUTE_HOST_DEVICE char const* to_string(LayoutType const& t) { + switch (t) { + case LayoutType::INTERLEAVE: return "INTERLEAVE"; + case LayoutType::B128: return "B128"; + case LayoutType::B64: return "B64"; + case LayoutType::B32: return "B32"; + } + return nullptr; +} + +#if !defined(__CUDACC_RTC__) +// Output operator for all enums in this namespace +CUTE_HOST std::ostream& operator<<(std::ostream& os, LayoutType const& t) { + char const* s = to_string(t); + if (s) { + std::operator<<(os, s); // Explicit call to avoid ambiguity + } else { + os.setstate(std::ios_base::failbit); + } + return os; +} +#endif // !defined(__CUDACC_RTC__) + +} // end namespace SM90::GMMA + +union GmmaDescriptor +{ + CUTE_HOST_DEVICE constexpr + GmmaDescriptor() noexcept : desc_(0) {} + CUTE_HOST_DEVICE constexpr + GmmaDescriptor(uint64_t desc) noexcept : desc_(desc) {} + CUTE_HOST_DEVICE constexpr + GmmaDescriptor(GmmaDescriptor const& t) noexcept : desc_(t.desc_) {} + CUTE_HOST_DEVICE constexpr + GmmaDescriptor(GmmaDescriptor && t) noexcept : desc_(t.desc_) {} + + CUTE_HOST_DEVICE constexpr + GmmaDescriptor& operator=(GmmaDescriptor const& t) noexcept { + desc_ = t.desc_; + return *this; + } + + CUTE_HOST_DEVICE constexpr + GmmaDescriptor& operator=(GmmaDescriptor && t) 
noexcept {
+    desc_ = t.desc_;
+    return *this;
+  }
+
+  uint64_t desc_;
+  uint32_t reg32_[2];
+  uint16_t reg16_[4];
+
+  // Bitfield implementation avoids the need for shifts in assignment
+  struct {
+    // start_address, bit [0,14), 4LSB not included
+    uint16_t start_address_ : 14, : 2;        // 14 bits [0,14), 2 bits unused
+    // leading dimension byte offset, bit [16,30), 4LSB not included
+    // For N: This is the stride from the first col to the second col of the 8x2 brick in INTERLEAVED
+    // Unused for all SWIZZLE_* layouts (and assumed to be 1)
+    // For T: This is the stride from the first 8 rows to the next 8 rows.
+    uint16_t leading_byte_offset_ : 14, : 2;  // 14 bits [0,14), 2 bits unused
+    // stride dimension byte offset, bit [32,46), 4LSB not included
+    // For N: This is the stride from the first 8 rows to the next 8 rows.
+    // For T: This is the stride from the first 8 cols to the next 8 cols.
+    uint16_t stride_byte_offset_ : 14, : 2;   // 14 bits [0,14), 2 bits unused
+    // base_offset, bit [49,52)
+    // Valid only for SWIZZLE_128B and SWIZZLE_64B
+    uint8_t : 1, base_offset_ : 3, : 4;       // 1 bit unused, 3 bits [1,4), 4 bits unused
+    // layout type, bit [62,64)
+    // SWIZZLE_NONE = 0, SWIZZLE_32B = 3, SWIZZLE_64B = 2, SWIZZLE_128B = 1
+    uint8_t : 6, layout_type_ : 2;            // 6 bits unused, 2 bits [6,8)
+  } bitfield;
+
+  // Decay to a uint64_t
+  CUTE_HOST_DEVICE constexpr
+  operator uint64_t() const noexcept { return desc_; }
+};
+
+// Printer
+CUTE_HOST_DEVICE void
+print(GmmaDescriptor const& t)
+{
+#if !defined(__CUDACC_RTC__)
+  printf("GmmaDescriptor: 0x%016llx\n", static_cast<unsigned long long>(t.desc_));
+  printf("  start_addr : 0x%04x\n", t.bitfield.start_address_);
+  printf("  leading_off: 0x%04x (%d)\n", t.bitfield.leading_byte_offset_, t.bitfield.leading_byte_offset_);
+  printf("  stride_off : 0x%04x (%d)\n", t.bitfield.stride_byte_offset_, t.bitfield.stride_byte_offset_);
+  printf("  base_offset: 0x%01x\n", t.bitfield.base_offset_);
+  printf("  layout_type: 0x%01x (%s)\n", t.bitfield.layout_type_, to_string(static_cast<SM90::GMMA::LayoutType>(t.bitfield.layout_type_)));
+#endif // !defined(__CUDACC_RTC__)
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cute
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cute/arch/mma_sm90_gmma.hpp b/include/cute/arch/mma_sm90_gmma.hpp
new file mode 100644
index 0000000000..d809aa4a63
--- /dev/null
+++ b/include/cute/arch/mma_sm90_gmma.hpp
@@ -0,0 +1,20974 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>          // CUTE_HOST_DEVICE
+
+#include "cutlass/arch/synclog.hpp"
+
+// Config
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+# define CUTE_ARCH_MMA_SM90A_ENABLED
+#endif
+
+namespace cute {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Warpgroup sync primitives
+
+CUTE_HOST_DEVICE
+void
+warpgroup_arrive()
+{
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+  cutlass::arch::synclog_emit_warpgroup_arrive(__LINE__);
+  asm volatile ("wgmma.fence.sync.aligned;\n" ::: "memory");
+#else
+  CUTE_INVALID_CONTROL_PATH("Attempting to use wgmma.fence without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+}
+
+template <int N>
+CUTE_HOST_DEVICE
+void
+warpgroup_wait()
+{
+  static_assert(N >= 0 && N <= 7, "WGMMA wait: N must be in range [0, 7]");
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+  cutlass::arch::synclog_emit_warpgroup_wait(__LINE__, N);
+  asm volatile("wgmma.wait_group.sync.aligned %0;\n" :: "n"(N) : "memory");
+#else
+  CUTE_INVALID_CONTROL_PATH("Attempting to use wgmma.wait_group without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+}
+
+// Marks the commit point for one or more batches of warpgroup MMAs.
+CUTE_HOST_DEVICE
+void
+warpgroup_commit_batch()
+{
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+  cutlass::arch::synclog_emit_warpgroup_commit_batch(__LINE__);
+  asm volatile("wgmma.commit_group.sync.aligned;\n" ::: "memory");
+#else
+  CUTE_INVALID_CONTROL_PATH("Attempting to use wgmma.commit_group without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+}
+
+CUTE_HOST_DEVICE
+void
+warpgroup_fence_operand(uint32_t& reg) {
+  // MSVC emits a build error for 'asm volatile'
+  // even if it only occurs in a __device__ function.
+  // This prevents the error.
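+  //
+  // Typical composition of these primitives (illustrative sketch only; `acc_regs` is a
+  // hypothetical array of accumulator registers, not something defined in this header):
+  //
+  //   for (uint32_t& r : acc_regs) { warpgroup_fence_operand(r); }
+  //   warpgroup_arrive();            // wgmma.fence before issuing the async MMAs
+  //   ... issue one or more wgmma.mma_async operations ...
+  //   warpgroup_commit_batch();      // close the batch of MMAs issued above
+  //   warpgroup_wait<0>();           // wait until no committed batches remain pending
+  //   for (uint32_t& r : acc_regs) { warpgroup_fence_operand(r); }
+  //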
+#if defined(__CUDA_ARCH__) + asm volatile("" : "+r"(reg) :: "memory"); +#endif +} + +CUTE_HOST_DEVICE +void +warpgroup_fence_operand(float& reg) { +#if defined(__CUDA_ARCH__) + asm volatile("" : "+f"(reg) :: "memory"); +#endif +} + +namespace SM90::GMMA { + +enum class Major { + K = 0, + MN = 1 +}; + +enum class ScaleOut { + Zero = 0, + One = 1 +}; + +enum class ScaleIn { + Neg = -1, + One = 1 +}; + +enum class SparseSel { + Zero = 0, + One = 1 +}; + + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// GMMA PTX definitions: C = (scaleA * A) * (scaleB * B) + (scaleD * C) +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %4, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16 " + "{%0, %1}," + " %2," + " %3," + " p, %5, %6, %7, %8;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %7, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16 " + "{%0, %1}," + "{%2, %3, %4, %5}," + " %6," + " p, %8, %9, %10;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct 
MMA_64x16x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k16.f16.f16.f16 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8, %9, %10;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k16.f16.f16.f16 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11, %12;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p, %11, %12, %13, %14;\n" + "}\n" + : "+r"(d0), "+r"(d1), 
"+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15, %16;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
MMA_64x64x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), 
"+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t 
& d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p, %51, %52, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, 
uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, 
uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p, %67, %68, %69, %70;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & 
d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71, %72;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8, %9, %10;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11, %12;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p, %11, %12, %13, %14;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15, %16;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20, %21, %22;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, 
%6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23, %24;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36, %37, %38;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + 
float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39, %40;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p, %51, %52, %53, %54;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), 
+ "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55, %56;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x16 F32+=F16*F16 
+template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p, %67, %68, %69, %70;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + 
CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71, %72;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float 
& d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p, %99, %100, %101, %102;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p, %102, %103, %104;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), 
"+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, 
%58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p, %131, %132, %133, %134;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & 
d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p, %134, %135, %136;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), 
"+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k16.f32.bf16.bf16 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8, %9, %10;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k16.f32.bf16.bf16 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11, %12;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32BF16BF16_RS without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p, %11, %12, %13, %14;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15, %16;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, 
+ float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20, %21, %22;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23, %24;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, 
+ float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36, %37, %38;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39, %40;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), 
"+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p, %51, %52, %53, %54;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + 
CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55, %56;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float 
& d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p, %67, %68, %69, %70;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71, %72;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p, %99, %100, %101, %102;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & 
d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p, %102, %103, %104;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & 
d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p, %131, %132, %133, %134;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), 
"+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, 
float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p, %134, %135, %136;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = 
GMMA::ScaleIn::One +> +struct MMA_64x8x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k8.f32.tf32.tf32 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k8.f32.tf32.tf32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p, %11, %12;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x8_F32TF32TF32_SS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& 
a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = 
uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " 
%48," + " %49," + " p, %51, %52;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p, %67, %68;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, 
uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, 
float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p, %99, %100;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = 
float[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p, %102, %103;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), 
"n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " 
%120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p, %131, %132;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & 
d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p, %134, %135;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), 
"r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=S8*S8 +struct MMA_64x8x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=S8*S8 +struct MMA_64x8x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=S8*S8 +struct MMA_64x16x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=S8*S8 +struct MMA_64x16x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=S8*S8 +struct MMA_64x32x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=S8*S8 +struct MMA_64x32x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 
p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=S8*S8 +struct MMA_64x64x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=S8*S8 +struct MMA_64x64x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=S8*S8 +struct MMA_64x96x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_SS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=S8*S8 +struct MMA_64x96x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=S8*S8 +struct MMA_64x128x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & 
d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=S8*S8 +struct MMA_64x128x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & 
d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=S8*S8 +struct MMA_64x192x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + 
uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=S8*S8 +struct MMA_64x192x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & 
d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=S8*S8 +struct MMA_64x256x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, 
+ uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), 
"+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=S8*S8 +struct MMA_64x256x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & 
d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=S8*S8 +struct MMA_64x8x32_S32S8S8_RS_TN +{ + using 
DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=S8*S8 +struct MMA_64x8x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=S8*S8 +struct MMA_64x16x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=S8*S8 +struct MMA_64x16x32_S32S8S8_RS_TN_SATURATE +{ + using 
DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=S8*S8 +struct MMA_64x32x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=S8*S8 +struct MMA_64x32x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + 
"{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=S8*S8 +struct MMA_64x64x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=S8*S8 +struct MMA_64x64x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & 
d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=S8*S8 +struct MMA_64x96x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), 
"+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=S8*S8 +struct MMA_64x96x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=S8*S8 +struct MMA_64x128x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, 
uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=S8*S8 +struct MMA_64x128x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, 
uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=S8*S8 +struct MMA_64x192x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & 
d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=S8*S8 +struct MMA_64x192x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t 
& d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), 
"+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=S8*S8 +struct MMA_64x256x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " 
%56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=S8*S8 +struct MMA_64x256x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, 
uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), 
+ "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=S8*U8 +struct MMA_64x8x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=S8*U8 +struct MMA_64x8x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=S8*U8 +struct MMA_64x16x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=S8*U8 +struct MMA_64x16x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=S8*U8 +struct MMA_64x32x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=S8*U8 +struct MMA_64x32x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = 
uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=S8*U8 +struct MMA_64x64x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=S8*U8 +struct MMA_64x64x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE 
static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=S8*U8 +struct MMA_64x96x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), 
"+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=S8*U8 +struct MMA_64x96x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=S8*U8 +struct MMA_64x128x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + 
CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=S8*U8 +struct MMA_64x128x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & 
d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=S8*U8 +struct MMA_64x192x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, 
uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=S8*U8 +struct MMA_64x192x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t 
& d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_SS_TN_SATURATE without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=S8*U8 +struct MMA_64x256x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, 
%100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=S8*U8 +struct MMA_64x256x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + 
uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), 
"+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=S8*U8 +struct MMA_64x8x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=S8*U8 +struct MMA_64x8x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=S8*U8 +struct MMA_64x16x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + 
"setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=S8*U8 +struct MMA_64x16x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=S8*U8 +struct MMA_64x32x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=S8*U8 +struct MMA_64x32x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = 
uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=S8*U8 +struct MMA_64x64x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 
64x64x32 TN S32+=S8*U8 +struct MMA_64x64x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=S8*U8 +struct MMA_64x96x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8 " + "{%0, %1, 
%2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=S8*U8 +struct MMA_64x96x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), 
"r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=S8*U8 +struct MMA_64x128x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=S8*U8 +struct 
MMA_64x128x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=S8*U8 +struct MMA_64x192x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, 
uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : 
"r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=S8*U8 +struct MMA_64x192x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), 
"+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=S8*U8 +struct MMA_64x256x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, 
uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=S8*U8 +struct MMA_64x256x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & 
d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), 
"+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=U8*S8 +struct MMA_64x8x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=U8*S8 +struct MMA_64x8x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + 
"r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=U8*S8 +struct MMA_64x16x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=U8*S8 +struct MMA_64x16x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=U8*S8 +struct MMA_64x32x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, 
%11, %12, %13, %14, %15}," + " %16," + " %17," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=U8*S8 +struct MMA_64x32x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=U8*S8 +struct MMA_64x64x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + 
"+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=U8*S8 +struct MMA_64x64x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=U8*S8 +struct MMA_64x96x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & 
d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=U8*S8 +struct MMA_64x96x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), 
"+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=U8*S8 +struct MMA_64x128x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=U8*S8 +struct MMA_64x128x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=U8*S8 +struct MMA_64x192x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, 
uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + 
"r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=U8*S8 +struct MMA_64x192x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + 
"+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=U8*S8 +struct MMA_64x256x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=U8*S8 +struct MMA_64x256x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + 
uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + 
"+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=U8*S8 +struct MMA_64x8x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=U8*S8 +struct MMA_64x8x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=U8*S8 +struct MMA_64x16x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=U8*S8 +struct MMA_64x16x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=U8*S8 +struct MMA_64x32x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8 " + 
"{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=U8*S8 +struct MMA_64x32x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=U8*S8 +struct MMA_64x64x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, 
%29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=U8*S8 +struct MMA_64x64x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=U8*S8 +struct MMA_64x96x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, 
uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=U8*S8 +struct MMA_64x96x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " 
%24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=U8*S8 +struct MMA_64x128x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), 
"+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=U8*S8 +struct MMA_64x128x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), 
"+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=U8*S8 +struct MMA_64x192x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + 
"+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=U8*S8 +struct MMA_64x192x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, 
%44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=U8*S8 +struct MMA_64x256x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + 
uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : 
"r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=U8*S8 +struct MMA_64x256x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " 
%56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=U8*U8 +struct MMA_64x8x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=U8*U8 +struct 
MMA_64x8x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=U8*U8 +struct MMA_64x16x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=U8*U8 +struct MMA_64x16x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=U8*U8 +struct MMA_64x32x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters 
= uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=U8*U8 +struct MMA_64x32x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=U8*U8 +struct MMA_64x64x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + 
uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=U8*U8 +struct MMA_64x64x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=U8*U8 +struct MMA_64x96x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& 
desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=U8*U8 +struct MMA_64x96x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg 
.pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=U8*U8 +struct MMA_64x128x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + 
"+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=U8*U8 +struct MMA_64x128x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), 
"+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=U8*U8 +struct MMA_64x192x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), 
"+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=U8*U8 +struct MMA_64x192x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, 
%68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=U8*U8 +struct MMA_64x256x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, 
uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
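+// Note: in the *_SS_TN integer GMMA atoms above, both A and B are read from shared memory
+// through 64-bit wgmma matrix descriptors (desc_a, desc_b). The predicate `p` computed from
+// `scale_D` selects between D = A*B (GMMA::ScaleOut::Zero) and D = A*B + D (ScaleOut::One).
+// The *_SATURATE twins emit the same instruction with the `.satfinite` qualifier, which
+// clamps the signed 32-bit accumulator to its representable range instead of wrapping.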
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=U8*U8 +struct MMA_64x256x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " 
%104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=U8*U8 +struct MMA_64x8x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN S32+=U8*U8 +struct MMA_64x8x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t 
const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=U8*U8 +struct MMA_64x16x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN S32+=U8*U8 +struct MMA_64x16x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=U8*U8 +struct MMA_64x32x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + 
using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN S32+=U8*U8 +struct MMA_64x32x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=U8*U8 +struct MMA_64x64x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t 
& d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN S32+=U8*U8 +struct MMA_64x64x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
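+// Note: the *_RS_TN variants in this block source operand A from registers (four 32-bit
+// fragments per thread, ARegisters = uint32_t[4]) while operand B is still referenced through
+// a shared-memory wgmma descriptor; scale_D and the .satfinite handling are identical to the
+// *_SS_TN atoms above.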
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=U8*U8 +struct MMA_64x96x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN S32+=U8*U8 +struct MMA_64x96x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & 
d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=U8*U8 +struct MMA_64x128x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8 " + "{%0, 
%1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN S32+=U8*U8 +struct MMA_64x128x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, 
%65, %66, %67}," + " %68," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=U8*U8 +struct MMA_64x192x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, 
%38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN S32+=U8*U8 +struct MMA_64x192x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, 
uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=U8*U8 +struct MMA_64x256x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & 
d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), 
+ "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN S32+=U8*U8 +struct MMA_64x256x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + 
uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint64_t 
const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %4, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e4m3 " + "{%0, %1}," + " %2," + " %3," + " p, %5, %6;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %7, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e4m3 " + "{%0, %1}," + "{%2, %3, %4, %5}," + " %6," + " p, %8, %9;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; 
+ + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F32+=E4M3*E4M3 
+template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p, %11, %12;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + 
" %8," + " %9," + " p, %11, %12;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn 
scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; 
+ + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn 
scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), 
"+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p, %51, %52;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), 
"+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & 
d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p, %67, %68;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), 
"+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), 
"+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using 
BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + 
float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p, %99, %100;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, 
float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p, %102, %103;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
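//
// Usage sketch (editorial illustration, hedged): the atoms above share one calling
// convention -- accumulator fragments are passed by reference, A arrives either as
// four 32-bit register words (RS) or as a 64-bit shared-memory descriptor (SS), and
// scale_D is lowered to the predicate `p`, selecting D = A*B (Zero) versus
// D = A*B + D (One). The sketch below drives MMA_64x8x32_F16E4M3E4M3_SS_TN through
// the standard warpgroup fence/commit/wait sequence. Assumptions: the atom is
// visible from the enclosing namespace as written, and desc_a/desc_b are valid GMMA
// shared-memory descriptors built elsewhere (descriptor construction is omitted).
//
// #include <cute/arch/mma_sm90_gmma.hpp>

CUTE_DEVICE void
fp8_gmma_64x8x32_sketch(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[2])
{
  cute::warpgroup_fence_operand(acc[0]);        // keep the compiler from reordering
  cute::warpgroup_fence_operand(acc[1]);        //   accesses to the accumulator registers
  cute::warpgroup_arrive();                     // wgmma.fence before issuing MMAs
  MMA_64x8x32_F16E4M3E4M3_SS_TN<>::fma(
      desc_a, desc_b,
      acc[0], acc[1],
      cute::GMMA::ScaleOut::One);               // One: accumulate into acc; Zero: overwrite
  cute::warpgroup_commit_batch();               // wgmma.commit_group
  cute::warpgroup_wait<0>();                    // wait for all committed wgmma groups
  cute::warpgroup_fence_operand(acc[0]);
  cute::warpgroup_fence_operand(acc[1]);
}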
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F16E4M3E4M3_RS_TN +{ + using 
DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& 
desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p, %131, %132;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), 
"+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float 
& d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p, %134, %135;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + 
CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %4, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e5m2 " + "{%0, %1}," + " %2," + " %3," + " p, %5, %6;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %7, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e5m2 " + "{%0, %1}," + "{%2, %3, %4, %5}," + " %6," + " p, %8, %9;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = 
uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
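Every atom in this family routes scale_D through the `setp.ne.b32` predicate guarding the accumulator: `GMMA::ScaleOut::Zero` overwrites the D tile with A*B, while `ScaleOut::One` accumulates into the existing contents. As a plain element-level reference of that contract (hypothetical names, per-thread fragment layout ignored):

// Element-level reference of the scale_D contract; illustrative only, not part of this
// change. A is M x K and B is N x K, both K-contiguous, matching the TN layouts these
// atoms target; the real instruction's per-thread fragment mapping is not modeled.
#include <cstddef>
#include <vector>

void ref_wgmma_tile(std::vector<float>& D, std::vector<float> const& A,
                    std::vector<float> const& B, int M, int N, int K, int scale_d) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      // scale_d == 0 (ScaleOut::Zero) overwrites the tile; scale_d != 0 accumulates,
      // which is what the setp.ne.b32 predicate operand selects in the asm above.
      float acc = scale_d ? D[size_t(m) * N + n] : 0.0f;
      for (int k = 0; k < K; ++k) {
        acc += A[size_t(m) * K + k] * B[size_t(n) * K + k];
      }
      D[size_t(m) * N + n] = acc;
    }
  }
}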
+// GMMA 64x16x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p, %11, %12;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e5m2 " + 
"{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p, %11, %12;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN 
F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = 
uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 
TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), 
"+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut 
const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p, %51, %52;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), 
"+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, 
uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p, %67, %68;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), 
"+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), 
"+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; 
+ using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, 
float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p, %99, %100;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + 
float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p, %102, %103;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E5M2_RS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> 
+struct MMA_64x256x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void 
+ fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p, %131, %132;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), 
"+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, 
float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p, %134, %135;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = 
uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %4, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e4m3 " + "{%0, %1}," + " %2," + " %3," + " p, %5, %6;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %7, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e4m3 " + "{%0, %1}," + "{%2, %3, %4, %5}," + " %6," + " p, %8, %9;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + 
using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
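+// Illustrative note (not part of the original header): each shape in this family comes
+// in an "SS" flavor, where both A and B are read from shared memory through 64-bit
+// wgmma descriptors, and an "RS" flavor, where the A fragment is held in four 32-bit
+// registers and only B uses a descriptor. As a minimal sketch, a direct call of the
+// MMA_64x8x32_F32E5M2E4M3_SS_TN atom defined above could look like this (desc_a and
+// desc_b are assumed to be valid GMMA shared-memory descriptors built elsewhere, and
+// the surrounding warpgroup fence/commit/wait protocol remains the caller's job):
+//
+//   float acc[4] = {0.f, 0.f, 0.f, 0.f};            // per-thread accumulator fragment
+//   MMA_64x8x32_F32E5M2E4M3_SS_TN<>::fma(
+//       desc_a, desc_b,
+//       acc[0], acc[1], acc[2], acc[3],
+//       GMMA::ScaleOut::Zero);                       // overwrite D instead of accumulating
+//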
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p, %11, %12;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg 
.pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p, %11, %12;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
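+// Illustrative note (not part of the original header): the prologue shared by every
+// fma() above does two things. The cutlass::arch::synclog_emit_wgmma_* call records the
+// wgmma issue for the synclog debugging tool and is expected to compile to nothing when
+// synclog is disabled. The "setp.ne.b32 p, %N, 0" line converts the runtime scale_D
+// argument into the predicate that wgmma.mma_async uses to choose between accumulating
+// (GMMA::ScaleOut::One, D = A*B + D) and overwriting (GMMA::ScaleOut::Zero, D = A*B),
+// e.g. for the 64x8x32 F16 atom defined earlier (d0, d1 being the uint32_t fragment):
+//
+//   MMA_64x8x32_F16E5M2E4M3_SS_TN<>::fma(desc_a, desc_b, d0, d1, GMMA::ScaleOut::Zero); // first k-block
+//   MMA_64x8x32_F16E5M2E4M3_SS_TN<>::fma(desc_a, desc_b, d0, d1);                       // later k-blocks accumulate
+//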
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> 
+struct MMA_64x64x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
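+// Illustrative note (not part of the original header): these atoms are normally consumed
+// through CuTe's tiled-MMA machinery rather than by calling fma() directly. A minimal
+// sketch, assuming a matching cute::MMA_Traits specialization exists for this shape (as
+// it does for the other GMMA atoms) and writing the atom name as it appears in this file:
+//
+//   using Atom     = cute::MMA_Atom<MMA_64x64x32_F32E5M2E4M3_SS_TN<>>;
+//   auto tiled_mma = cute::make_tiled_mma(Atom{});   // TiledMMA used to partition A/B/C tensors
+//
+// The collective mainloop then builds the shared-memory descriptors and drives scale_D,
+// so user code rarely touches the raw fma() entry points.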
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28;\n" + "}\n" + : 
"+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, 
+ float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p, %51, %52;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + 
"+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t 
& d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, 
%63}," + " %64," + " %65," + " p, %67, %68;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + 
"+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float 
& d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p, %99, %100;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + 
fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p, %102, %103;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); 
+#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F16+=E5M2*E4M3 +template < + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using 
BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p, %131, %132;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), 
"+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + 
float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p, %134, %135;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct 
MMA_64x8x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %4, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e5m2 " + "{%0, %1}," + " %2," + " %3," + " p, %5, %6;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %7, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e5m2 " + "{%0, %1}," + "{%2, %3, %4, %5}," + " %6," + " p, %8, %9;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x8x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " p, %7, %8;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " p, %10, %11;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
MMA_64x16x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p, %11, %12;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x16x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " p, %11, %12;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " p, %14, %15;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
MMA_64x32x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x32x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " p, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F16+=E5M2*E5M2 +template < + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " p, %22, %23;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to 
use MMA_64x64x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x64x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, 
%17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, 
float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p, %51, %52;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x96x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + 
"+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " p, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, 
uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " p, %38, %39;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, 
%47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p, %67, %68;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x128x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + 
"+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " p, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// 
GMMA 64x192x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " p, %54, %55;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & 
d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %98, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " p, %99, %100;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x192x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using 
BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %101, 0;\n" + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " p, %102, %103;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), 
"r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " p, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " p, %70, %71;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %130, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " p, %131, %132;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), 
"+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x256x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & 
d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %133, 0;\n" + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " p, %134, %135;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // 
namespace SM90::GMMA + +} // namespace cute + +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) +#include "mma_sm90_gmma_ext.hpp" +#endif diff --git a/include/cute/arch/mma_sm90_gmma_ext.hpp b/include/cute/arch/mma_sm90_gmma_ext.hpp new file mode 100644 index 0000000000..10a36aff80 --- /dev/null +++ b/include/cute/arch/mma_sm90_gmma_ext.hpp @@ -0,0 +1,56445 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>   // CUTE_HOST_DEVICE
+
+#include "cutlass/arch/synclog.hpp"
+
+// Config
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+#  define CUTE_ARCH_MMA_SM90A_ENABLED
+#endif
+
+namespace cute {
+
+namespace SM90::GMMA {
+
+// GMMA 64x24x16 F16+=F16*F16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x24x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[6];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3,
+      uint32_t & d4, uint32_t & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n24k16.f16.f16.f16 "
+      "{%0, %1, %2, %3, %4, %5},"
+      " %6,"
+      " %7,"
+      " p, %9, %10, %11, %12;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      : "l"(desc_a),
+        "l"(desc_b),
+        "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x16 F16+=F16*F16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x24x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[6];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3,
+      uint32_t & d4, uint32_t & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n24k16.f16.f16.f16 "
+      "{%0, %1, %2, %3, %4, %5},"
+      "{%6, %7, %8, %9},"
+      " %10,"
+      " p, %12, %13, %14;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      : "r"(a0), "r"(a1), "r"(a2), "r"(a3),
+        "l"(desc_b),
+        "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x16 F16+=F16*F16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x40x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+ using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + " %10," + " %11," + " p, %13, %14, %15, %16;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + "{%10, %11, %12, %13}," + " %14," + " p, %16, %17, %18;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + 
GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16, %17, %18;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + 
".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + " %14," + " %15," + " p, %17, %18, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + "{%14, %15, %16, %17}," + " %18," + " p, %20, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + 
"setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + " %18," + " %19," + " p, %21, %22, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + "{%18, %19, %20, %21}," + " %22," + " p, %24, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + 
GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27, %28;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, 
uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + " %22," + " %23," + " p, %25, %26, %27, %28;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + "{%22, %23, %24, %25}," + " %26," + " p, %28, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// 
GMMA 64x104x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + " %26," + " %27," + " p, %29, %30, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + "{%26, %27, %28, %29}," + " %30," + 
" p, %32, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & 
d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + " %30," + " %31," + " p, %33, %34, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + "{%30, %31, %32, %33}," + " %34," + " p, %36, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, 
+ uint32_t & d32, uint32_t & d33, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + " %34," + " %35," + " p, %37, %38, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + "{%34, %35, %36, %37}," + " %38," + " p, %40, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), 
"n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & 
d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + " %38," + " %39," + " p, %41, %42, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), 
"+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + "{%38, %39, %40, %41}," + " %42," + " p, %44, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; 
+ using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + " %42," + " %43," + " p, %45, %46, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), 
"+r"(d39), + "+r"(d40), "+r"(d41) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + "{%42, %43, %44, %45}," + " %46," + " p, %48, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE 
static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + 
uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + " %46," + " %47," + " p, %49, %50, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), 
"+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + "{%46, %47, %48, %49}," + " %50," + " p, %52, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
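//
// [Editor's illustrative sketch -- not part of this change.] Each *_SS atom above wraps a single
// wgmma.mma_async instruction: operands A and B are shared-memory matrix descriptors (the
// uint64_t desc_a/desc_b arguments) and D is a per-thread register fragment that the 128-thread
// warpgroup owns collectively. The sketch below shows the minimal arrive/commit/wait pattern
// around a direct call to one of these atoms. Assumptions: the atom name resolves in this
// header's enclosing namespace, the warpgroup_arrive / warpgroup_commit_batch / warpgroup_wait
// helpers from cute/arch/mma_sm90_gmma.hpp are visible, the example_wgmma_64x24x16 function name
// is hypothetical, and desc_a/desc_b are valid GMMA shared-memory descriptors built by the caller.
//
#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
// Every thread of the warpgroup calls this with the same descriptors and its own accumulator fragment.
CUTE_HOST_DEVICE void
example_wgmma_64x24x16(uint64_t desc_a, uint64_t desc_b, float (&acc)[12])
{
  cute::warpgroup_arrive();                             // wgmma.fence: register/smem operands are ready
  MMA_64x24x16_F32F16F16_SS<GMMA::Major::K, GMMA::Major::K>::fma(
      desc_a, desc_b,
      acc[0], acc[1], acc[2], acc[3], acc[4],  acc[5],
      acc[6], acc[7], acc[8], acc[9], acc[10], acc[11],
      GMMA::ScaleOut::One);                             // One accumulates into acc; Zero overwrites it
  cute::warpgroup_commit_batch();                       // wgmma.commit_group
  cute::warpgroup_wait<0>();                            // block until all committed wgmma batches complete
}
#endif
//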
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + " %50," + " %51," + " p, %53, %54, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t 
const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + "{%50, %51, %52, %53}," + " %54," + " p, %56, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + 
uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + 
"wgmma.mma_async.sync.aligned.m64n208k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + " %54," + " %55," + " p, %57, %58, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), 
"+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + "{%54, %55, %56, %57}," + " %58," + " p, %60, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), 
"+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x16 
F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = 
uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + " %58," + " %59," + " p, %61, %62, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, 
uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + "{%58, %59, %60, %61}," + " %62," + " p, %64, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & 
d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, 
uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + " %62," + " %63," + " p, %65, %66, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x16 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" 
+ ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + "{%62, %63, %64, %65}," + " %66," + " p, %68, %69, %70;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16, %17, %18;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + static_assert(tnspA == GMMA::Major::K, + 
"Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19, %20;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24, %25, %26;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + 
CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27, %28;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28, %29, %30;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31, %32;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32, %33, %34;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), 
"+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35, %36;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const 
scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40, %41, %42;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43, %44;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + 
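+    // Note: CUTE_ARCH_MMA_SM90A_ENABLED is defined only when compiling for the sm_90a target
+    // (wgmma.mma_async is an architecture-accelerated feature), so on any other architecture
+    // instantiating this atom falls through to the diagnostic below.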
CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44, %45, %46;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & 
d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47, %48;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48, %49, %50;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + 
"+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51, %52;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x16_F32F16F16_SS +{ + using DRegisters = void; + using 
ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56, %57, %58;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float 
& d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59, %60;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, 
%28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60, %61, %62;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63, %64;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + 
"+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64, %65, %66;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67, %68;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %70, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " p, %71, %72, %73, %74;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must 
have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %73, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " p, %74, %75, %76;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + 
float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p, %75, %76, %77, %78;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & 
d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p, %78, %79, %80;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & 
d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %78, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " p, %79, %80, %81, %82;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + 
float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %81, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " p, %82, %83, %84;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & 
d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p, %83, %84, %85, %86;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t 
const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p, %86, %87, %88;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; 
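+  // Accumulator fragment size: the 64x168 f32 D tile is distributed across the 128 threads
+  // of the warpgroup, so each thread owns 168/2 = 84 f32 registers; for this _SS atom both
+  // A and B are supplied as 64-bit shared-memory matrix descriptors rather than register fragments.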
+ using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %86, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " p, %87, %88, %89, %90;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA 
= GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %89, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " p, %90, %91, %92;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), 
"n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p, %91, %92, %93, %94;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), 
"+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p, %94, %95, %96;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), 
+ "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %94, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " 
+ " %88, %89, %90, %91}," + " %92," + " %93," + " p, %95, %96, %97, %98;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + 
".reg .pred p;\n" + "setp.ne.b32 p, %97, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " p, %98, %99, %100;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, 
float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %102, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " p, %103, %104, %105, %106;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& 
a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %105, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " p, %106, %107, %108;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), 
"+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p, %107, %108, %109, %110;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), 
"+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + 
float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p, %110, %111, %112;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & 
d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %110, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " p, %111, %112, %113, %114;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), 
"+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %113, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, 
%109, %110, %111}," + " %112," + " p, %114, %115, %116;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, 
float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p, %115, %116, %117, %118;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major 
layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p, %118, %119, %120;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), 
"+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %118, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, 
%15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " p, %119, %120, %121, %122;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, 
float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %121, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " p, %122, %123, %124;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), 
"+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " 
+ " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p, %123, %124, %125, %126;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, 
float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p, %126, %127, %128;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), 
"r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %126, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, 
%113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " p, %127, %128, %129, %130;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x16 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & 
d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %129, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " p, %130, %131, %132;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), 
"+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16, %17, %18;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19, %20;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32BF16BF16_RS without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24, %25, %26;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27, %28;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), 
"r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28, %29, %30;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, 
%14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31, %32;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32, %33, %34;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, 
float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35, %36;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40, %41, %42;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43, %44;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + 
float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44, %45, %46;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47, %48;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), 
"+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48, %49, %50;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + 
CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51, %52;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, 
desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56, %57, %58;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59, %60;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), 
"+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60, %61, %62;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63, %64;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x16_F32BF16BF16_SS +{ + using DRegisters = void; + 
using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64, %65, %66;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, 
float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67, %68;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & 
d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %70, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " p, %71, %72, %73, %74;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float 
& d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %73, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " p, %74, %75, %76;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p, %75, %76, %77, %78;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p, %78, %79, %80;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + 
".reg .pred p;\n" + "setp.ne.b32 p, %78, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " p, %79, %80, %81, %82;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %81, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " p, %82, %83, %84;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p, %83, %84, %85, %86;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + 
float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p, %86, %87, %88;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, 
float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %86, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " p, %87, %88, %89, %90;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & 
d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %89, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " p, %90, %91, %92;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float 
& d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p, %91, %92, %93, %94;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using 
BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p, %94, %95, %96;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %94, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " p, %95, %96, %97, %98;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), 
"+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %97, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " p, %98, %99, %100;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), 
"+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %102, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " 
+ " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " p, %103, %104, %105, %106;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & 
d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %105, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " p, %106, %107, %108;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x16_F32BF16BF16_SS +{ + using 
DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p, %107, %108, %109, %110;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), 
"+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, 
%57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p, %110, %111, %112;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & 
d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %110, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " p, %111, %112, %113, %114;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using 
CRegisters = float[108]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %113, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " p, %114, %115, %116;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + 
"+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, 
%26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p, %115, %116, %117, %118;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, 
float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p, %118, %119, %120;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), 
"n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %118, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " p, %119, %120, %121, %122;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), 
"+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, 
float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %121, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " p, %122, %123, %124;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = 
GMMA::ScaleIn::One +> +struct MMA_64x240x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p, %123, %124, %125, %126;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), 
"+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float 
& d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p, %126, %127, %128;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE 
static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %126, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " p, %127, %128, %129, %130;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), 
"+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x16 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & 
d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %129, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " p, %130, %131, %132;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters 
= float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + 
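+    // Operand layout of the wgmma.mma_async below, per PTX wgmma semantics:
+    //   %0-%19  : per-thread f32 accumulator fragment ("+f" read/write constraints)
+    //   %20, %21: 64-bit shared-memory matrix descriptors for A and B
+    //   p       : predicate from scale_D; 0 computes D = A*B, non-zero D = A*B + D
+    //   %23, %24: immediate input scales for A and B (GMMA::ScaleIn::One or ::Neg)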
asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + 
asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, 
float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = 
uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), 
"+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + 
float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), 
"+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + 
using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & 
d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), 
+ "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), 
"n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + 
fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & 
d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %70, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " p, %71, %72;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & 
d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %73, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " p, %74, %75;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
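+    // Record this SMEM x SMEM WGMMA in the synclog debug trace, then issue the asynchronous MMA.
+    // Predicate p, derived from scale_D, selects accumulation: D = A*B + D when p is true, D = A*B otherwise.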
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p, %75, %76;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k8.f32.tf32.tf32 
" + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p, %78, %79;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %78, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, 
%28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " p, %79, %80;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %81, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " 
%40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " p, %82, %83;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, 
%45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p, %83, %84;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, 
%37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p, %86, %87;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %86, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, 
%21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " p, %87, %88;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %89, 
0;\n" + "wgmma.mma_async.sync.aligned.m64n168k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " p, %90, %91;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const 
scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p, %91, %92;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float 
& d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p, %94, %95;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, 
float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %94, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " p, %95, %96;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float 
& d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %97, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " p, %98, %99;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x8_F32TF32TF32_SS_TN +{ + using DRegisters = 
void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %102, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " p, %103, %104;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), 
"+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %105, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " p, %106, %107;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), 
"+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p, %107, %108;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, 
float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p, %110, %111;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
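
All of the SS/RS pairs in this file share one inline-PTX operand layout: the `N/2` accumulator registers occupy `%0..%(N/2-1)`, followed by the A operand (one smem descriptor for SS, four fragment registers for RS), the B descriptor, `scale_D`, and the `scaleA`/`scaleB` immediates. The predicate index in each `setp.ne.b32` line is therefore a pure function of the tile width N. A minimal standalone sketch (plain C++, no CUTLASS dependency) that encodes and checks that pattern against the atoms above:

```cpp
// Operand numbering used by the 64xNx8 wgmma atoms in this file:
//   SS: %0..%(N/2-1) accumulators, %(N/2) desc_a, %(N/2+1) desc_b,
//       %(N/2+2) scale_D, %(N/2+3) scaleA, %(N/2+4) scaleB
//   RS: the single desc_a is replaced by four A-fragment registers,
//       shifting the trailing operands by three.
constexpr int ss_scale_d_operand(int n) { return n / 2 + 2; }
constexpr int rs_scale_d_operand(int n) { return n / 2 + 5; }

// These match the "setp.ne.b32 p, %k, 0;" lines of the corresponding atoms.
static_assert(ss_scale_d_operand(176) ==  90 && rs_scale_d_operand(176) ==  93, "");
static_assert(ss_scale_d_operand(208) == 106 && rs_scale_d_operand(208) == 109, "");
static_assert(ss_scale_d_operand(240) == 122 && rs_scale_d_operand(240) == 125, "");
```
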
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %110, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " p, %111, %112;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + 
"+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %113, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, 
%7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " p, %114, %115;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & 
d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p, %115, %116;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
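
These `fma` overloads are not intended to be called by hand (the 64x224 SS atom above alone takes 112 accumulator references); in practice they are wrapped by CuTe's `MMA_Atom`/`TiledMMA` machinery, which owns the accumulator fragment and expands the argument pack. A rough usage sketch, assuming these atoms live in `cute::SM90::GMMA` and have the usual `MMA_Traits` specializations (both assumptions based on surrounding CuTe SM90 support, not shown in this patch):

```cpp
// Rough sketch only: header paths, the SM90::GMMA namespace, and the helper name
// below are assumptions; the atom name comes from the definitions above.
#include <cute/atom/mma_atom.hpp>
#include <cute/atom/mma_traits_sm90_gmma.hpp>

using namespace cute;

// One of the TF32 warpgroup atoms defined above; scaleA/scaleB default to One.
using Op = SM90::GMMA::MMA_64x224x8_F32TF32TF32_SS_TN<>;

// make_tiled_mma builds a TiledMMA from the atom's register descriptions; CuTe's
// SM90 mainloops then drive scale_D through the traits' accumulate_ state so the
// first wgmma of a tile can overwrite the accumulator (ScaleOut::Zero) and the
// remaining ones accumulate (ScaleOut::One).
inline auto make_tf32_warpgroup_tiled_mma() {   // hypothetical helper name
  return make_tiled_mma(Op{});
}
```

The SS form keeps both operands in shared memory behind descriptors, while the RS form streams the A fragment from registers, which is why only `desc_b` remains a `uint64_t` descriptor in the RS signatures.
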
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p, %118, %119;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), 
"+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %118, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " p, %119, %120;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + 
float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %121, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " p, %122, %123;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), 
"+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, 
%71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p, %123, %124;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, 
float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p, %126, %127;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); 
+#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %126, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " p, %127, %128;\n" + "}\n" + : "+f"(d000), 
"+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x8 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, 
float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %129, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " p, %130, %131;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
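+//
+// The wgmma wrappers in this file all follow the same operand convention: the accumulator
+// registers are read-modify-write ("+f"/"+r"), desc_a/desc_b are 64-bit shared-memory matrix
+// descriptors ("l"), and scale_D is lowered onto the predicate p so that ScaleOut::Zero makes
+// the instruction overwrite the accumulator (D = A*B) while ScaleOut::One accumulates
+// (D = D + A*B). The floating-point atoms above additionally take scaleA/scaleB as immediate
+// ("n") operands that can negate the A/B inputs (ScaleIn::Neg); the integer atoms that follow
+// omit them. A minimal usage sketch, assuming the CuTe TiledMMA API, with namespace qualifiers
+// omitted and the partitioned fragments tCrA/tCrB/tCrC left undefined here:
+//
+//   auto tiled_mma = cute::make_tiled_mma(MMA_64x240x8_F32TF32TF32_SS_TN<>{});
+//   cute::warpgroup_arrive();                 // fence register/smem inputs before wgmma
+//   cute::gemm(tiled_mma, tCrA, tCrB, tCrC);  // issues wgmma.mma_async through this atom
+//   cute::warpgroup_commit_batch();
+//   cute::warpgroup_wait<0>();                // accumulator results in tCrC are now visible
+//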
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=S8*S8 +struct MMA_64x24x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=S8*S8 +struct MMA_64x24x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=S8*S8 +struct MMA_64x48x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=S8*S8 +struct MMA_64x48x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=S8*S8 +struct MMA_64x80x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + 
uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=S8*S8 +struct MMA_64x80x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
MMA_64x80x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=S8*S8 +struct MMA_64x112x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=S8*S8 +struct MMA_64x112x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & 
d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=S8*S8 +struct MMA_64x144x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t 
& d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=S8*S8 +struct MMA_64x144x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, 
uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=S8*S8 +struct MMA_64x160x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut 
const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=S8*S8 +struct MMA_64x160x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + 
uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=S8*S8 +struct MMA_64x176x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & 
d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=S8*S8 +struct MMA_64x176x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & 
d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=S8*S8 +struct MMA_64x208x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, 
uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + 
"+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=S8*S8 +struct MMA_64x208x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), 
"+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=S8*S8 +struct MMA_64x224x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, 
uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=S8*S8 +struct MMA_64x224x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, 
uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), 
"+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=S8*S8 +struct MMA_64x240x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " 
+ " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=S8*S8 +struct MMA_64x240x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, 
+ uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), 
"+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=S8*S8 +struct MMA_64x24x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=S8*S8 +struct MMA_64x24x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=S8*S8 +struct MMA_64x48x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters 
= uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=S8*S8 +struct MMA_64x48x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// 
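+
+// Usage note: the GMMA ops in this header are not meant to be called directly. In CuTe they
+// are normally consumed through `cute::MMA_Atom` / `cute::make_tiled_mma`, which use the
+// DRegisters/ARegisters/BRegisters/CRegisters typedefs to partition fragments and then
+// dispatch to `fma(...)`. A minimal sketch (assuming the matching MMA_Traits specialization
+// is in scope; the tensor names are illustrative, not prescriptive):
+//
+//   auto tiled_mma = cute::make_tiled_mma(MMA_64x80x32_S32S8S8_RS_TN{});
+//   // partition operands with tiled_mma.get_thread_slice(threadIdx.x), then
+//   // cute::gemm(tiled_mma, tCrA, tCrB, tCrC) issues the wgmma.mma_async shown below.
+//
+// Operand numbering in the RS (register-sourced A) atoms that follow: each asm block lists
+// the accumulator registers first, then the four A-fragment registers, then desc_b, and
+// finally the scale_D input. For MMA_64x80x32_S32S8S8_RS_TN that is %0-%39 for D, %40-%43
+// for A, %44 for desc_b, and %45 for scale_D, which is why its predicate is set from %45.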
+ +// GMMA 64x80x32 TN S32+=S8*S8 +struct MMA_64x80x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=S8*S8 +struct MMA_64x80x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm 
volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=S8*S8 +struct MMA_64x112x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + 
"+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=S8*S8 +struct MMA_64x112x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=S8*S8 +struct 
MMA_64x144x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=S8*S8 +struct MMA_64x144x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using 
BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=S8*S8 +struct MMA_64x160x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static 
void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=S8*S8 +struct 
MMA_64x160x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_RS_TN_SATURATE without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=S8*S8 +struct MMA_64x176x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), 
"+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=S8*S8 +struct MMA_64x176x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), 
"+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=S8*S8 +struct MMA_64x208x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, 
%5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=S8*S8 +struct MMA_64x208x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, 
uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=S8*S8 +struct 
MMA_64x224x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), 
"+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=S8*S8 +struct MMA_64x224x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, 
uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=S8*S8 +struct MMA_64x240x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t 
& d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), 
"+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=S8*S8 +struct MMA_64x240x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & 
d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=S8*U8 +struct MMA_64x24x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=S8*U8 +struct MMA_64x24x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=S8*U8 +struct MMA_64x48x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + 
"+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=S8*U8 +struct MMA_64x48x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=S8*U8 +struct MMA_64x80x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, 
%37, %38, %39}," + " %40," + " %41," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=S8*U8 +struct MMA_64x80x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=S8*U8 +struct MMA_64x112x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, 
uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=S8*U8 +struct MMA_64x112x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, 
uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=S8*U8 +struct MMA_64x144x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, 
%21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=S8*U8 +struct MMA_64x144x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, 
%53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=S8*U8 +struct MMA_64x160x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, 
" + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=S8*U8 +struct MMA_64x160x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " 
%48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=S8*U8 +struct MMA_64x176x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, 
%12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=S8*U8 +struct MMA_64x176x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + 
GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=S8*U8 +struct MMA_64x208x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & 
d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=S8*U8 +struct MMA_64x208x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& 
desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), 
"+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=S8*U8 +struct MMA_64x224x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, 
%31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=S8*U8 +struct MMA_64x224x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, 
uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
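+// NOTE (illustrative sketch, not part of the generated atoms): each *_SS_TN atom above takes two
+// 64-bit shared-memory matrix descriptors and accumulates into the per-thread uint32_t fragment
+// passed by reference; the *_RS_TN atoms below source the A operand from registers instead. The
+// _SATURATE variants emit the `.satfinite` PTX form, which clamps the s32 accumulator rather than
+// wrapping on overflow. A minimal direct call, assuming placeholder names `a0..a3` (A fragment),
+// `desc_b` (B descriptor) and `d[12]` (accumulators) are already populated, might look like:
+//
+//   uint32_t d[12] = {};  // one atom's worth of C/D accumulator registers
+//   MMA_64x24x32_S32S8U8_RS_TN::fma(a0, a1, a2, a3, desc_b,
+//       d[0], d[1], d[2], d[3], d[4],  d[5],
+//       d[6], d[7], d[8], d[9], d[10], d[11],
+//       GMMA::ScaleOut::One);   // One: D += A*B (predicate set); Zero: D = A*B
+//
+// In practice these atoms are issued through MMA_Atom/TiledMMA and bracketed by the warpgroup
+// arrive/commit/wait fences used elsewhere in CUTLASS rather than being called directly.
+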
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=S8*U8 +struct MMA_64x240x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p;\n" + "}\n" + : "+r"(d000), 
"+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=S8*U8 +struct MMA_64x240x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & 
d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=S8*U8 +struct 
MMA_64x24x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=S8*U8 +struct MMA_64x24x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=S8*U8 +struct MMA_64x48x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & 
d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=S8*U8 +struct MMA_64x48x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=S8*U8 +struct MMA_64x80x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t 
& d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=S8*U8 +struct MMA_64x80x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), 
"+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=S8*U8 +struct MMA_64x112x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=S8*U8 +struct 
MMA_64x112x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=S8*U8 +struct MMA_64x144x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & 
d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=S8*U8 +struct MMA_64x144x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & 
d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=S8*U8 +struct MMA_64x160x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, 
uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=S8*U8 +struct MMA_64x160x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & 
d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=S8*U8 +struct MMA_64x176x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & 
d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=S8*U8 +struct 
MMA_64x176x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + 
"+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=S8*U8 +struct MMA_64x208x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p;\n" + "}\n" + : 
"+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=S8*U8 +struct MMA_64x208x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, 
uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=S8*U8 +struct MMA_64x224x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t 
& d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), 
"+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=S8*U8 +struct MMA_64x224x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, 
%23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=S8*U8 +struct MMA_64x240x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & 
d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), 
"+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=S8*U8 +struct MMA_64x240x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " 
+ " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=U8*S8 +struct MMA_64x24x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_SS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=U8*S8 +struct MMA_64x24x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=U8*S8 +struct MMA_64x48x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=U8*S8 +struct MMA_64x48x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, 
uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=U8*S8 +struct MMA_64x80x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_SS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=U8*S8 +struct MMA_64x80x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=U8*S8 +struct MMA_64x112x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, 
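+        // Each of the warpgroup's 128 threads holds 56 of the 64x112 s32 accumulators (64*112/128 = 56), which is why CRegisters is uint32_t[56].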
uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=U8*S8 +struct MMA_64x112x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, 
%55}," + " %56," + " %57," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=U8*S8 +struct MMA_64x144x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + 
"+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=U8*S8 +struct MMA_64x144x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + 
"+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=U8*S8 +struct MMA_64x160x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), 
"+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=U8*S8 +struct MMA_64x160x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), 
"+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=U8*S8 +struct MMA_64x176x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), 
"+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=U8*S8 +struct MMA_64x176x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, 
%63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=U8*S8 +struct MMA_64x208x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + 
uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=U8*S8 +struct MMA_64x208x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, 
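+        // Like all _SS_ atoms, both the A and B tiles are read from shared memory via the 64-bit matrix descriptors desc_a and desc_b.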
uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
MMA_64x208x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=U8*S8 +struct MMA_64x224x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), 
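+        // synclog_emit_wgmma_smem_smem records this WGMMA issue in CUTLASS's synclog trace when that debugging facility is enabled; otherwise it has no effect.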
"+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=U8*S8 +struct MMA_64x224x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, 
uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=U8*S8 +struct MMA_64x240x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & 
d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), 
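+        // wgmma.mma_async is issued collectively by the 128 threads of the warpgroup and completes asynchronously; the accumulators are only safe to read after a wgmma.commit_group / wgmma.wait_group fence.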
"+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=U8*S8 +struct MMA_64x240x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut 
const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=U8*S8 +struct MMA_64x24x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, 
desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=U8*S8 +struct MMA_64x24x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=U8*S8 +struct MMA_64x48x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), 
"+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=U8*S8 +struct MMA_64x48x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=U8*S8 +struct MMA_64x80x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + 
"wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=U8*S8 +struct MMA_64x80x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=U8*S8 +struct 
MMA_64x112x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=U8*S8 +struct MMA_64x112x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, 
+ uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=U8*S8 +struct MMA_64x144x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, 
uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=U8*S8 +struct MMA_64x144x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + 
uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=U8*S8 +struct MMA_64x160x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, 
uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=U8*S8 +struct MMA_64x160x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & 
d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=U8*S8 +struct MMA_64x176x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & 
d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=U8*S8 +struct MMA_64x176x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & 
d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=U8*S8 +struct MMA_64x208x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + 
fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), 
"+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=U8*S8 +struct MMA_64x208x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, 
%18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=U8*S8 +struct MMA_64x224x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, 
uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_RS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=U8*S8 +struct MMA_64x224x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), 
"+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=U8*S8 +struct MMA_64x240x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, 
uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=U8*S8 +struct MMA_64x240x32_S32U8S8_RS_TN_SATURATE +{ + using 
DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), 
"+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=U8*U8 +struct MMA_64x24x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=U8*U8 +struct MMA_64x24x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + 
GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=U8*U8 +struct MMA_64x48x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=U8*U8 +struct MMA_64x48x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + 
"wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=U8*U8 +struct MMA_64x80x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=U8*U8 +struct MMA_64x80x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t 
& d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=U8*U8 +struct MMA_64x112x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, 
%39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=U8*U8 +struct MMA_64x112x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), 
"+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=U8*U8 +struct MMA_64x144x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
MMA_64x144x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=U8*U8 +struct MMA_64x144x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
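+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Annotation (descriptive comment only, not part of the generated atoms): every struct in this
+// family is a thin wrapper around a single Hopper `wgmma.mma_async` PTX instruction, and the
+// name encodes the variant, roughly as follows:
+//   MMA_64x<N>x32_S32U8U8_SS_TN[_SATURATE]
+//     64x<N>x32  - the MxNxK tile computed by one warpgroup per call (M = 64, K = 32)
+//     S32U8U8    - signed 32-bit accumulator with unsigned 8-bit A and B operands
+//     SS / RS    - A and B both sourced from shared memory via 64-bit descriptors (SS), or
+//                  A sourced from registers a000..a003 and B from a descriptor (RS)
+//     TN         - K-major A and K-major B (the layout used by the integer wgmma variants here)
+//     _SATURATE  - lowers to the `.satfinite` qualifier, clamping the s32 accumulator on overflow
+// The `scale_D` argument is materialized as predicate `p` (`setp.ne.b32`) and passed as the
+// scale-d operand, so ScaleOut::One accumulates into D while ScaleOut::Zero overwrites it.
+// `CRegisters` holds N/2 32-bit fragments per thread: the 64*N accumulators of the tile are
+// distributed across the 128 threads of the warpgroup.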
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=U8*U8 +struct MMA_64x160x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
MMA_64x160x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=U8*U8 +struct MMA_64x160x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + 
"l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=U8*U8 +struct MMA_64x176x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), 
"+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=U8*U8 +struct MMA_64x176x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), 
+ "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=U8*U8 +struct MMA_64x208x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, 
%26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=U8*U8 +struct MMA_64x208x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, 
uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=U8*U8 +struct MMA_64x224x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, 
uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), 
"+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=U8*U8 +struct MMA_64x224x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 
p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=U8*U8 +struct MMA_64x240x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & 
d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), 
"+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=U8*U8 +struct MMA_64x240x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, 
%41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=U8*U8 +struct MMA_64x24x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
MMA_64x24x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN S32+=U8*U8 +struct MMA_64x24x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=U8*U8 +struct MMA_64x48x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN S32+=U8*U8 +struct MMA_64x48x32_S32U8U8_RS_TN_SATURATE +{ + using 
DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=U8*U8 +struct MMA_64x80x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), 
"+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN S32+=U8*U8 +struct MMA_64x80x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=U8*U8 +struct MMA_64x112x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, 
uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN S32+=U8*U8 +struct MMA_64x112x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & 
d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=U8*U8 +struct MMA_64x144x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8 " + "{%0, %1, %2, %3, 
%4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN S32+=U8*U8 +struct MMA_64x144x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, 
%20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=U8*U8 +struct MMA_64x160x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, 
%14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN S32+=U8*U8 +struct MMA_64x160x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + 
".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=U8*U8 +struct MMA_64x176x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + 
uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN S32+=U8*U8 +struct MMA_64x176x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, 
uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=U8*U8 +struct MMA_64x208x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, 
+ uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + 
"+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN S32+=U8*U8 +struct MMA_64x208x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " 
p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=U8*U8 +struct MMA_64x224x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + 
uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN S32+=U8*U8 +struct MMA_64x224x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, 
uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), 
"+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=U8*U8 +struct MMA_64x240x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, 
uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN S32+=U8*U8 +struct MMA_64x240x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, 
uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + 
"+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5}," + " %6," + " %7," + " p, %9, %10;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5}," + "{%6, %7, %8, %9}," + " %10," + " p, %12, %13;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(int32_t(scale_D)), 
"n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + 
using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + " %10," + " %11," + " p, %13, %14;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + "{%10, %11, %12, %13}," + " %14," + " p, %16, %17;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, 
desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + 
"wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + 
"+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + " %14," + " %15," + " p, %17, %18;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), 
+ "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + "{%14, %15, %16, %17}," + " %18," + " p, %20, %21;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + 
"+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + 
"wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + " %18," + " %19," + " p, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + "{%18, %19, %20, %21}," + " %22," + " p, %24, %25;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F16+=E4M3*E4M3 +template < 
+ GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
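Every atom added in this hunk follows the same call pattern; below is a minimal sketch (not part of the diff) of driving one of them directly, assuming the atom's enclosing cute namespace is already in scope, that desc_a/desc_b are shared-memory matrix descriptors built elsewhere, and that the usual CuTe warpgroup fence helpers (warpgroup_arrive, warpgroup_commit_batch, warpgroup_wait) are available — none of that scaffolding appears in this hunk.

// Illustrative sketch only: issue a single 64x80x32 FP8 (E4M3 x E4M3) GMMA with
// F16 accumulation, accumulating into 20 packed 16-bit-pair registers.
// Descriptor construction and the surrounding mainloop are assumed, not shown.
__device__ void issue_one_gmma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[20])
{
  cute::warpgroup_arrive();                       // fence prior register/SMEM writes (assumed CuTe helper)
  MMA_64x80x32_F16E4M3E4M3_SS_TN<>::fma(          // atom from this hunk; enclosing namespace assumed in scope
      desc_a, desc_b,
      acc[0],  acc[1],  acc[2],  acc[3],  acc[4],  acc[5],  acc[6],  acc[7],
      acc[8],  acc[9],  acc[10], acc[11], acc[12], acc[13], acc[14], acc[15],
      acc[16], acc[17], acc[18], acc[19],
      GMMA::ScaleOut::One);                       // keep prior accumulator contents: D = A*B + D
  cute::warpgroup_commit_batch();                 // group the issued wgmma (assumed CuTe helper)
  cute::warpgroup_wait<0>();                      // block until the committed GMMA retires
}

The RS variants differ only in taking four packed A-fragment registers in place of desc_a, and the F32 variants only in the width and count of the accumulator arguments; otherwise the calling sequence is identical.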
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + " %22," + " %23," + " p, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t 
const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + "{%22, %23, %24, %25}," + " %26," + " p, %28, %29;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), 
"+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, 
uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + " %26," + " %27," + " p, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + "{%26, %27, %28, %29}," + " %30," + " p, %32, %33;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, 
float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : 
"l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & 
d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, 
%53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + " %30," + " %31," + " p, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE 
static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + "{%30, %31, %32, %33}," + " %34," + " p, %36, %37;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, 
%31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + 
"+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + " %34," + " %35," + " p, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, 
uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + "{%34, %35, %36, %37}," + " %38," + " p, %40, %41;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %70, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, 
%27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " p, %71, %72;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %73, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " p, %74, %75;\n" + "}\n" + : "+f"(d00), 
"+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, 
float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p, %75, %76;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) 
+ { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p, %78, %79;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + " %38," + " %39," + " p, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), 
"+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + "{%38, %39, %40, %41}," + " %42," + " p, %44, %45;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + 
CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %78, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " p, %79, %80;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + 
uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %81, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " p, %82, %83;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & 
d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, 
%28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p, %83, %84;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), 
"+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p, %86, %87;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + 
"+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + " %42," + " %43," + " p, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + "{%42, %43, %44, %45}," + " %46," + " p, %48, %49;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + 
float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %86, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " p, %87, %88;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float 
& d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %89, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " p, %90, %91;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, 
uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + 
"wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " 
%72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p, %91, %92;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, 
%21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p, %94, %95;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, 
%45}," + " %46," + " %47," + " p, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + "{%46, %47, %48, %49}," + " %50," + " p, %52, %53;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E4M3_RS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %94, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " p, %95, %96;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + 
"+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %97, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " p, %98, %99;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), 
"+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + " %50," + " %51," + " p, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + "{%50, %51, %52, %53}," + " %54," + " p, %56, %57;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, 
float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %102, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " p, %103, %104;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + 
"+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %105, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " p, %106, %107;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + 
"+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), 
"+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p, %107, %108;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), 
"+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " 
%40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p, %110, %111;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + " %54," + " %55," + " p, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + "{%54, %55, %56, %57}," + " %58," + " p, %60, %61;\n" + "}\n" + : 
"+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %110, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, 
%51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " p, %111, %112;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & 
d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %113, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " p, %114, %115;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters 
= uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, 
uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, 
float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p, %115, %116;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p, %118, %119;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), 
"+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + " %58," + " %59," + " p, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), 
"+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + "{%58, %59, %60, %61}," + " %62," + " p, %64, %65;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : 
"r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %118, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " p, %119, %120;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), 
"+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float 
& d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %121, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " p, %122, %123;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + 
fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, 
uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, 
float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p, %123, %124;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), 
"+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, 
%115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p, %126, %127;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + " %62," + " %63," + " p, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, 
%20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + "{%62, %63, %64, %65}," + " %66," + " p, %68, %69;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, 
float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %126, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " p, %127, %128;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + 
float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %129, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " p, %130, %131;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), 
"+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5}," + " %6," + " %7," + " p, %9, %10;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5}," + "{%6, %7, %8, %9}," + " %10," + " p, %12, %13;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), 
+ "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using 
ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + " %10," + " %11," + " p, %13, %14;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + "{%10, %11, %12, %13}," + " %14," + " p, %16, %17;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" 
+ "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), 
"+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + " %14," + " %15," + " p, %17, %18;\n" + "}\n" + : "+r"(d00), 
"+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + "{%14, %15, %16, %17}," + " %18," + " p, %20, %21;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), 
"+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 
0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + " %18," + " %19," + " p, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + "{%18, %19, %20, %21}," + " %22," + " p, %24, %25;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F16+=E4M3*E5M2 +template < 
+ GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
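
All of the atoms in this file follow the same CuTe MMA Operation pattern: `ARegisters`/`BRegisters` name the descriptor or register fragments `fma` consumes (a single `uint64_t` shared-memory descriptor per operand for the `_SS_TN` forms, four `uint32_t` A fragments plus a B descriptor for `_RS_TN`), `CRegisters` names the accumulator fragment, and `fma` issues exactly one `wgmma.mma_async` instruction. The runtime `scale_D` argument becomes the instruction's scale-d predicate (`GMMA::ScaleOut::One` accumulates into D, while a zero scale overwrites it), and the `scaleA`/`scaleB` template parameters feed the immediate operands that can negate the A and B inputs. As a rough sketch of how one of these FP8 atoms is consumed higher up the stack — the include paths, the namespace nesting, and the `make_tiled_mma` call below are assumptions for illustration, not something this diff establishes:

```cpp
// Illustrative sketch only; not part of this change. Assumes the usual CuTe
// headers and that the atom below is reachable from namespace cute (adjust the
// qualification to whatever namespace nesting this header actually uses).
#include <cute/tensor.hpp>
#include <cute/atom/mma_atom.hpp>

using namespace cute;

// D(f32) += A(e4m3) * B(e5m2) on a 64x80x32 tile with both operands sourced
// from shared memory (the _SS_TN variant); ScaleIn::One leaves A and B
// un-negated.
using Fp8Op = MMA_64x80x32_F32E4M3E5M2_SS_TN<GMMA::ScaleIn::One,
                                             GMMA::ScaleIn::One>;

// The MMA_Traits specialization provided alongside these atoms supplies the
// thread/value layouts, so the op can be lifted into a warpgroup-wide TiledMMA.
using TiledMma = decltype(make_tiled_mma(Fp8Op{}));
```

At the collective-mainloop level the kernel still has to bracket calls to such an atom with the usual `wgmma.fence` / `wgmma.commit_group` / `wgmma.wait_group` sequence; the structs here encapsulate only the single MMA instruction itself.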
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + " %22," + " %23," + " p, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t 
const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + "{%22, %23, %24, %25}," + " %26," + " p, %28, %29;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), 
"+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, 
uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + " %26," + " %27," + " p, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + "{%26, %27, %28, %29}," + " %30," + " p, %32, %33;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, 
float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : 
"l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & 
d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, 
%53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + " %30," + " %31," + " p, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE 
static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + "{%30, %31, %32, %33}," + " %34," + " p, %36, %37;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, 
%31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + 
"+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + " %34," + " %35," + " p, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, 
uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + "{%34, %35, %36, %37}," + " %38," + " p, %40, %41;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %70, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, 
%27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " p, %71, %72;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %73, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " p, %74, %75;\n" + "}\n" + : "+f"(d00), 
"+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, 
float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p, %75, %76;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) 
+ { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p, %78, %79;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + " %38," + " %39," + " p, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), 
"+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + "{%38, %39, %40, %41}," + " %42," + " p, %44, %45;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + 
CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %78, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " p, %79, %80;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + 
uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %81, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " p, %82, %83;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & 
d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, 
%28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p, %83, %84;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), 
"+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p, %86, %87;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + 
"+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + " %42," + " %43," + " p, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + "{%42, %43, %44, %45}," + " %46," + " p, %48, %49;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + 
float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %86, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " p, %87, %88;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float 
& d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %89, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " p, %90, %91;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, 
uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + 
"wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " 
%72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p, %91, %92;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, 
%21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p, %94, %95;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, 
%45}," + " %46," + " %47," + " p, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + "{%46, %47, %48, %49}," + " %50," + " p, %52, %53;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E5M2_RS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %94, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " p, %95, %96;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + 
"+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %97, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " p, %98, %99;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), 
"+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + " %50," + " %51," + " p, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + "{%50, %51, %52, %53}," + " %54," + " p, %56, %57;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, 
float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %102, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " p, %103, %104;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + 
"+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %105, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " p, %106, %107;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + 
"+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), 
"+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p, %107, %108;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), 
"+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " 
%40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p, %110, %111;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + " %54," + " %55," + " p, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + "{%54, %55, %56, %57}," + " %58," + " p, %60, %61;\n" + "}\n" + : 
"+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %110, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, 
%51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " p, %111, %112;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & 
d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %113, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " p, %114, %115;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters 
= uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, 
uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, 
float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p, %115, %116;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p, %118, %119;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), 
"+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + " %58," + " %59," + " p, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), 
"+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + "{%58, %59, %60, %61}," + " %62," + " p, %64, %65;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : 
"r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %118, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " p, %119, %120;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), 
"+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float 
& d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %121, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " p, %122, %123;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + 
fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, 
uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, 
float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p, %123, %124;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), 
"+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, 
%115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p, %126, %127;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + " %62," + " %63," + " p, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, 
%20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + "{%62, %63, %64, %65}," + " %66," + " p, %68, %69;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, 
float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %126, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " p, %127, %128;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + 
float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %129, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " p, %130, %131;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), 
"+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5}," + " %6," + " %7," + " p, %9, %10;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5}," + "{%6, %7, %8, %9}," + " %10," + " p, %12, %13;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), 
+ "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using 
ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + " %10," + " %11," + " p, %13, %14;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + "{%10, %11, %12, %13}," + " %14," + " p, %16, %17;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" 
+ "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), 
"+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + " %14," + " %15," + " p, %17, %18;\n" + "}\n" + : "+r"(d00), 
"+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + "{%14, %15, %16, %17}," + " %18," + " p, %20, %21;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), 
"+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 
0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + " %18," + " %19," + " p, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + "{%18, %19, %20, %21}," + " %22," + " p, %24, %25;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F16+=E5M2*E4M3 +template < 
+ GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
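[Reviewer note, not part of the patch] These hunks are mechanically generated wrappers around the SM90 `wgmma.mma_async` PTX instruction for FP8 `E5M2 x E4M3` inputs, one struct per N tile size; the `_SS_` variants read both operands through shared-memory matrix descriptors, while the `_RS_` variants take the A fragment from registers. As a rough illustration of how such an atom is invoked, below is a minimal sketch (purely illustrative, not part of the diff) that calls the 64x80x32 F16-accumulating SS variant defined just above. It assumes `desc_a` and `desc_b` are valid GMMA shared-memory descriptors built elsewhere, and it omits the warpgroup fence/commit/wait synchronization that real callers issue around `wgmma.mma_async`.

// Illustrative sketch only, under the assumptions stated above.
__device__ void fp8_wgmma_64x80_sketch(uint64_t desc_a, uint64_t desc_b)
{
  uint32_t d[20] = {};  // 20 packed f16x2 accumulators (CRegisters = uint32_t[20])

  // D = A * B + D; ScaleOut::One keeps the existing accumulator contents.
  MMA_64x80x32_F16E5M2E4M3_SS_TN<>::fma(
      desc_a, desc_b,
      d[ 0], d[ 1], d[ 2], d[ 3], d[ 4], d[ 5], d[ 6], d[ 7],
      d[ 8], d[ 9], d[10], d[11], d[12], d[13], d[14], d[15],
      d[16], d[17], d[18], d[19],
      GMMA::ScaleOut::One);
}

In practice these atoms are not called by hand; they are consumed through CuTe's tiled-MMA machinery, which also handles descriptor construction and warpgroup synchronization.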
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + " %22," + " %23," + " p, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t 
const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + "{%22, %23, %24, %25}," + " %26," + " p, %28, %29;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), 
"+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, 
uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + " %26," + " %27," + " p, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + "{%26, %27, %28, %29}," + " %30," + " p, %32, %33;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, 
float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : 
"l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & 
d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, 
%53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + " %30," + " %31," + " p, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE 
static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + "{%30, %31, %32, %33}," + " %34," + " p, %36, %37;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, 
%31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + 
"+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + " %34," + " %35," + " p, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, 
uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + "{%34, %35, %36, %37}," + " %38," + " p, %40, %41;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %70, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, 
%27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " p, %71, %72;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %73, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " p, %74, %75;\n" + "}\n" + : "+f"(d00), 
"+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, 
float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p, %75, %76;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) 
+ { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p, %78, %79;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + " %38," + " %39," + " p, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), 
"+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + "{%38, %39, %40, %41}," + " %42," + " p, %44, %45;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + 
CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %78, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " p, %79, %80;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + 
uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %81, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " p, %82, %83;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & 
d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, 
%28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p, %83, %84;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), 
"+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p, %86, %87;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + 
"+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + " %42," + " %43," + " p, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + "{%42, %43, %44, %45}," + " %46," + " p, %48, %49;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + 
float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %86, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " p, %87, %88;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float 
& d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %89, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " p, %90, %91;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, 
uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + 
"wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " 
%72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p, %91, %92;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, 
%21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p, %94, %95;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, 
%45}," + " %46," + " %47," + " p, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + "{%46, %47, %48, %49}," + " %50," + " p, %52, %53;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E4M3_RS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %94, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " p, %95, %96;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + 
"+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %97, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " p, %98, %99;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), 
"+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + " %50," + " %51," + " p, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + "{%50, %51, %52, %53}," + " %54," + " p, %56, %57;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, 
float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %102, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " p, %103, %104;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + 
"+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %105, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " p, %106, %107;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + 
"+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), 
"+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p, %107, %108;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), 
"+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " 
%40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p, %110, %111;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + " %54," + " %55," + " p, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + "{%54, %55, %56, %57}," + " %58," + " p, %60, %61;\n" + "}\n" + : 
"+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %110, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, 
%51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " p, %111, %112;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & 
d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %113, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " p, %114, %115;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters 
= uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, 
uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, 
float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p, %115, %116;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p, %118, %119;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), 
"+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + " %58," + " %59," + " p, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), 
"+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + "{%58, %59, %60, %61}," + " %62," + " p, %64, %65;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : 
"r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %118, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " p, %119, %120;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), 
"+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float 
& d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %121, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " p, %122, %123;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + 
fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, 
uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, 
float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p, %123, %124;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), 
"+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, 
%115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p, %126, %127;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + " %62," + " %63," + " p, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, 
%20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + "{%62, %63, %64, %65}," + " %66," + " p, %68, %69;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, 
float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %126, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " p, %127, %128;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + 
float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %129, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " p, %130, %131;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), 
"+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5}," + " %6," + " %7," + " p, %9, %10;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5}," + "{%6, %7, %8, %9}," + " %10," + " p, %12, %13;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), 
+ "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x24x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x24x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using 
ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + " %10," + " %11," + " p, %13, %14;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + "{%10, %11, %12, %13}," + " %14," + " p, %16, %17;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x40x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x40x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" 
+ "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " p, %15, %16;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " p, %18, %19;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " p, %27, %28;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), 
"+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x48x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x48x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " p, %30, %31;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + " %14," + " %15," + " p, %17, %18;\n" + "}\n" + : "+r"(d00), 
"+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + "{%14, %15, %16, %17}," + " %18," + " p, %20, %21;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), 
"+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x56x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x56x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 
0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + " %18," + " %19," + " p, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + "{%18, %19, %20, %21}," + " %22," + " p, %24, %25;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x72x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x72x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F16+=E5M2*E5M2 +template < 
+ GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " p, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " p, %26, %27;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x80x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x80x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + " %22," + " %23," + " p, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t 
const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + "{%22, %23, %24, %25}," + " %26," + " p, %28, %29;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), 
"+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x88x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x88x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, 
uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + " %26," + " %27," + " p, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + "{%26, %27, %28, %29}," + " %30," + " p, %32, %33;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x104x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x104x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, 
float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " p, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : 
"l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " p, %34, %35;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & 
d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x112x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x112x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, 
%53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + " %30," + " %31," + " p, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE 
static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + "{%30, %31, %32, %33}," + " %34," + " p, %36, %37;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, 
%31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x120x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x120x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + 
"+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + " %34," + " %35," + " p, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, 
uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + "{%34, %35, %36, %37}," + " %38," + " p, %40, %41;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %70, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, 
%27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " p, %71, %72;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x136x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x136x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %73, 0;\n" + "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " p, %74, %75;\n" + "}\n" + : "+f"(d00), 
"+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " p, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " p, %42, %43;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, 
float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %74, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " p, %75, %76;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x144x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x144x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) 
+ { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %77, 0;\n" + "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " p, %78, %79;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + " %38," + " %39," + " p, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), 
"+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + "{%38, %39, %40, %41}," + " %42," + " p, %44, %45;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + 
CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %78, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " p, %79, %80;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x152x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x152x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + 
uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %81, 0;\n" + "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " p, %82, %83;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & 
d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " p, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, 
%28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " p, %46, %47;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %82, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " p, %83, %84;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), 
"+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x160x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x160x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %85, 0;\n" + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " p, %86, %87;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + 
"+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + " %42," + " %43," + " p, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + "{%42, %43, %44, %45}," + " %46," + " p, %48, %49;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + 
float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %86, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " p, %87, %88;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x168x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x168x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float 
& d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %89, 0;\n" + "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " p, %90, %91;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, 
uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " p, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + 
"wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " p, %50, %51;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %90, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " 
%72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " p, %91, %92;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x176x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x176x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %93, 0;\n" + "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, 
%21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " p, %94, %95;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, 
%45}," + " %46," + " %47," + " p, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + "{%46, %47, %48, %49}," + " %50," + " p, %52, %53;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E5M2_RS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %94, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " p, %95, %96;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + 
"+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x184x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x184x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %97, 0;\n" + "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " p, %98, %99;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), 
"+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + " %50," + " %51," + " p, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + "{%50, %51, %52, %53}," + " %54," + " p, %56, %57;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, 
float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %102, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " p, %103, %104;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + 
"+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x200x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x200x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %105, 0;\n" + "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " p, %106, %107;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + 
"+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " p, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), 
"+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " p, %58, %59;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %106, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " p, %107, %108;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), 
"+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x208x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x208x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %109, 0;\n" + "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " 
%40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " p, %110, %111;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + " %54," + " %55," + " p, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + "{%54, %55, %56, %57}," + " %58," + " p, %60, %61;\n" + "}\n" + : 
"+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %110, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, 
%51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " p, %111, %112;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x216x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x216x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & 
d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %113, 0;\n" + "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " p, %114, %115;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters 
= uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " p, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, 
uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " p, %62, %63;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, 
float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %114, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " p, %115, %116;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x224x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x224x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %117, 0;\n" + "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " p, %118, %119;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), 
"+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + " %58," + " %59," + " p, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), 
"+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + "{%58, %59, %60, %61}," + " %62," + " p, %64, %65;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : 
"r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %118, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " p, %119, %120;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), 
"+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x232x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x232x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float 
& d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %121, 0;\n" + "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " p, %122, %123;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + 
fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " p, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, 
uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " p, %66, %67;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, 
float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %122, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " p, %123, %124;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), 
"+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x240x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x240x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %125, 0;\n" + "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, 
%115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " p, %126, %127;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + " %62," + " %63," + " p, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, 
%20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + "{%62, %63, %64, %65}," + " %66," + " p, %68, %69;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, 
float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %126, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " p, %127, %128;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x248x32 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct MMA_64x248x32_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + 
float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %129, 0;\n" + "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " p, %130, %131;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), 
"+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace SM90::GMMA + +} // namespace cute diff --git a/include/cute/arch/mma_sm90_gmma_sparse.hpp b/include/cute/arch/mma_sm90_gmma_sparse.hpp new file mode 100644 index 0000000000..ecca91b93c --- /dev/null +++ b/include/cute/arch/mma_sm90_gmma_sparse.hpp @@ -0,0 +1,22743 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include // CUTE_HOST_DEVICE +#include // GMMA::Major, etc. 
+ +namespace cute { + +namespace SM90::GMMA::SPARSE { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// GMMA PTX definitions: C = (scaleA * A) * (scaleB * B) + (scaleD * C) +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k32.f16.f16.f16 " + "{%0, %1}," + " %2," + " %3," + " %4, %5," + " p, %7, %8, %9, %10;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k32.f16.f16.f16 " + "{%0, %1}," + "{%2, %3, %4, %5}," + " %6," + " %7, %8," + " p, %10, %11, %12;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x16x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k32.f16.f16.f16 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p, %9, %10, %11, %12;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k32.f16.f16.f16 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13, %14;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t 
& d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, %14, %15, %16;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17, %18;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut 
const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = 
uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p, %29, %30, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), 
"+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, 
uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg 
.pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, 
%56, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70, %71, %72;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + 
"+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73, %74;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), 
"+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.f16.f16 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p, %9, %10, %11, %12;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, 
%11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.f16.f16 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13, %14;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, %14, %15, %16;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17, %18;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), 
"r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22, %23, %24;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," 
+ "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25, %26;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38, %39, %40;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must 
have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41, %42;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, 
%13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54, %55, %56;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, %56, %57, %58;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), 
"+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70, %71, %72;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), 
"+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73, %74;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), 
"r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p, %101, %102, %103, %104;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + 
"+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + 
" %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p, %104, %105, %106;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + 
float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p, %133, %134, %135, %136;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), 
"+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, 
%22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p, %136, %137, %138;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm 
volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.bf16.bf16 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p, %9, %10, %11, %12;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.bf16.bf16 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13, %14;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, %14, %15, %16;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + 
"r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17, %18;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22, %23, %24;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + 
"l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25, %26;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38, %39, %40;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41, %42;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to 
use SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54, %55, %56;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A 
must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, %56, %57, %58;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & 
d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70, %71, %72;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + 
float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73, %74;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + 
float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p, %101, %102, %103, %104;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + 
fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p, %104, %105, %106;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), 
"n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, 
%82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p, %133, %134, %135, %136;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, 
float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p, %136, %137, %138;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), 
"+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k16.f32.tf32.tf32 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p, %9, %10;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k16.f32.tf32.tf32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + 
"l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, %14;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE 
GMMA 64x64x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k16.f32.tf32.tf32 " + "{%0, 
%1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
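// Fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined (i.e. the translation unit is not compiled for sm_90a): using this atom is reported as an invalid control path. +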
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, %56, %57;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t 
const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & 
d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + 
float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p, %101, %102;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + 
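// Per-thread register model for this RS atom: A is fed from registers (ARegisters), the structured-sparsity metadata from the `e` operand (ERegisters), and B through a 64-bit shared-memory matrix descriptor (BRegisters); the accumulators live in CRegisters and D is updated in place, hence DRegisters = void. + // In practice these atoms are driven through cute::MMA_Atom / MMA_Traits rather than by calling fma() directly. +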
using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p, %104, %105;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + 
"+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + 
" %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p, %133, %134;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, 
+ float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p, %136, %137;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), 
"+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x16x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + 
"setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); 
+ asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters 
= uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & 
d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, 
%17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, 
%62, %63}," + " %64," + " %65," + " %66, %67," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + 
" %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t 
& d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + 
uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), 
"+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, 
uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm 
volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), 
"r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, 
%22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& 
a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & 
d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, 
%20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + 
uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = 
uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," 
+ " %133, %134," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & 
d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), 
"+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = 
uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, 
%13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8 " + "{%0, %1, %2, %3, 
%4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t 
& d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + 
GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, 
%42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), 
"+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + 
" %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & 
d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, 
+ uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), 
"+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, 
uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=S8*U8 +template < + 
GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + 
"{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), 
"+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & 
d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, 
uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, 
%20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " 
%48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + 
"setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & 
d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + 
uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), 
"+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & 
d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), 
"+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, 
uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), 
"+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + 
" %34, %35," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, 
uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm 
volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p;\n" + "}\n" 
+ : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), 
"+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + 
" %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, 
uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & 
d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + 
"+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, 
uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters 
= uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + 
"l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), 
"+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, 
uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, 
%38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, 
%66, %67}," + " %68," + " %69, %70," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8 " + "{%0, %1, %2, 
%3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & 
d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + 
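+      // d000..d127 form the per-thread S32 accumulator fragment: the 64x256 accumulator
+      // tile holds 64*256 = 16384 s32 values spread across the 128 threads of the issuing
+      // warpgroup, i.e. 128 registers per thread (matching CRegisters = uint32_t[128]).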
uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), 
"+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + 
uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), 
"+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + 
GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + 
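+        // desc_a / desc_b are 64-bit shared-memory matrix descriptors ("l" constraint).
+        // In these _SS_TN atoms both A and B are read from shared memory via descriptors,
+        // whereas the _RS_TN atoms take the A fragment directly in registers.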
"l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), 
"+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & 
d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + 
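+        // The setp above converts the runtime scale_D argument (operand %52) into
+        // predicate p, which is passed as the instruction's scale-D input: when p is 0
+        // the previous accumulator is ignored (D = A*B), otherwise D = A*B + D.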
"wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), 
"+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), 
"+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, 
%81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, 
uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t 
& d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + 
"+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & 
d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = 
uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + 
"r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN S32+=U8*U8 
+template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), 
+ "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, 
uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + 
"setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, 
%63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), 
"+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, 
%28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, 
uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + 
uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), 
"+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + 
uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), 
"r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e4m3 " + "{%0, %1}," + " %2," + " %3," + " %4, %5," + " p, %7, %8;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e4m3 " + "{%0, %1}," + "{%2, %3, %4, %5}," + " %6," + " %7, %8," + " p, %10, %11;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE 
static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p, %9, %10;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p, %9, %10;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), 
"n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, %14;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using 
ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, %14;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const 
scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, 
uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + 
GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " 
%24," + " %25," + " %26, %27," + " p, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, 
float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, 
%55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, %56, %57;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, 
float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & 
d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, %56, %57;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + 
"+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, 
%89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p, %101, %102;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p, %104, %105;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + 
uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t 
& d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float 
& d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p, %133, %134;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), 
"+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, 
%29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p, %136, %137;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e5m2 " + "{%0, %1}," + " %2," + " %3," + " %4, %5," + " p, %7, %8;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e5m2 " + "{%0, %1}," + "{%2, %3, %4, %5}," + " %6," + " %7, %8," + " p, %10, %11;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p, %9, %10;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> 
+struct GMMA_64x8x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p, %9, %10;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, %14;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_RS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, %14;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = 
void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero 
+> +struct GMMA_64x64x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 
TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, 
%5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + 
fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54;\n" + "}\n" + : "+f"(d00), 
"+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, %56, %57;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, 
uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, 
%63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), 
"+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + 
"r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, %56, %57;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x192x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p, %101, %102;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), 
"+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p, %104, %105;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + 
"+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70;\n" + "}\n" + 
: "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + 
"+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p, %133, %134;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& 
a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p, %136, %137;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + 
"+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e4m3 " + "{%0, %1}," + " %2," + " %3," + " %4, %5," + " p, %7, %8;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + uint32_t const& e, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e4m3 " + "{%0, %1}," + "{%2, %3, %4, %5}," + " %6," + " %7, %8," + " p, %10, %11;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p, %9, %10;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
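+// ---------------------------------------------------------------------------
+// Usage sketch (illustrative only): these sparse atoms are normally driven
+// through CuTe's MMA_Atom/MMA_Traits machinery, but each one can also be
+// issued directly by calling its static fma() between the warpgroup fences
+// assumed to be provided by CuTe's SM90 GMMA helpers (warpgroup_arrive,
+// warpgroup_commit_batch, warpgroup_wait in cute/arch/mma_sm90_gmma.hpp).
+// The descriptor and metadata values below are placeholders; real code builds
+// desc_a/desc_b as shared-memory GMMA descriptors and takes the metadata word
+// `e` from the compressed sparse-E fragment.
+//
+// #if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+// __device__ void sparse_gmma_64x8x64_example(uint64_t desc_a, uint64_t desc_b, uint32_t e)
+// {
+//   // One thread's share of the 64x8 FP32 accumulator for this atom.
+//   float d0 = 0.f, d1 = 0.f, d2 = 0.f, d3 = 0.f;
+//   cute::warpgroup_arrive();                        // wgmma.fence before issuing the MMA
+//   cute::SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_SS_TN<>::fma(
+//       desc_a, desc_b, d0, d1, d2, d3, e, cute::GMMA::ScaleOut::One);
+//   cute::warpgroup_commit_batch();                  // wgmma.commit_group
+//   cute::warpgroup_wait<0>();                       // wait for the committed batch to retire
+// }
+// #endif
+// ---------------------------------------------------------------------------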
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p, %9, %10;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float 
& d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, %14;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, 
%14;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + 
"r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), 
"+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = 
GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), 
"+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x96x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, %56, %57;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t 
& d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), 
+ "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
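These sparse GMMA atoms are normally driven by CuTe's collective mainloops rather than called by hand, but a direct call makes the shared calling convention visible: every `*_SS_TN` atom above exposes `fma(desc_a, desc_b, d0, ..., dN-1, e, scale_D)`. The sketch below is illustrative only; it assumes this header is in scope, that the SMEM matrix descriptors and the 2:4 sparsity metadata word have already been built by the surrounding kernel, that the structs resolve as `cute::SM90::GMMA::SPARSE::...`, and the helper/function names are ours.

```cuda
#include <cstdint>
#include <utility>   // std::index_sequence

// Minimal sketch: expand a per-thread accumulator fragment into the long
// fma() argument list shared by the *_SS_TN atoms in this file.
template <class Atom, class AccType, std::size_t N, std::size_t... I>
__device__ void
call_sparse_ss_fma(uint64_t desc_a, uint64_t desc_b,
                   AccType (&acc)[N], uint32_t metadata,
                   std::index_sequence<I...>)
{
  // ScaleOut::Zero instead of One would overwrite rather than accumulate D.
  Atom::fma(desc_a, desc_b, acc[I]..., metadata, cute::GMMA::ScaleOut::One);
}

// Example: the 64x64x64 F16+=E5M2*E4M3 smem-x-smem atom holds 16 packed
// half-pair accumulators per thread (CRegisters = uint32_t[16]).
__device__ void
example_sparse_gmma_ss(uint64_t desc_a, uint64_t desc_b, uint32_t metadata)
{
  uint32_t acc[16] = {};
  call_sparse_ss_fma<cute::SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_SS_TN<>>(
      desc_a, desc_b, acc, metadata, std::make_index_sequence<16>{});
}
```

Real kernels additionally wrap these calls in the warpgroup fence / commit / wait sequence that CuTe's collective mainloops emit; that bookkeeping is omitted from the sketch.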
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + 
uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, %56, %57;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, 
float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p, %101, %102;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & 
d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p, %104, %105;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_RS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
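The `_RS_TN` variants differ from `_SS_TN` only in how A is sourced: four 32-bit register fragments (`ARegisters = uint32_t[4]`) replace the 64-bit SMEM descriptor, which shifts every later asm operand index by three. For the 64x256 F16 atom just above, the mapping is: accumulators %0-%63, A registers %64-%67, B descriptor %68, metadata %69, sparse selector %70, scale-D %71 (consumed by the `setp` that defines `p`), scale-A %72 and scale-B %73. A matching call sketch, with the same caveats and illustrative names as the SS sketch earlier:

```cuda
#include <cstdint>
#include <utility>   // std::index_sequence

// Sketch of the RS calling convention: A is supplied as four 32-bit
// register fragments in place of the SS variants' SMEM descriptor.
template <class Atom, class AccType, std::size_t N, std::size_t... I>
__device__ void
call_sparse_rs_fma(uint32_t const (&a)[4], uint64_t desc_b,
                   AccType (&acc)[N], uint32_t metadata,
                   std::index_sequence<I...>)
{
  Atom::fma(a[0], a[1], a[2], a[3], desc_b,
            acc[I]..., metadata, cute::GMMA::ScaleOut::One);
}

__device__ void
example_sparse_gmma_rs(uint32_t const (&a_frag)[4], uint64_t desc_b, uint32_t metadata)
{
  uint32_t acc[16] = {};  // 64x64x64 F16 RS atom: CRegisters = uint32_t[16]
  call_sparse_rs_fma<cute::SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_RS_TN<>>(
      a_frag, desc_b, acc, metadata, std::make_index_sequence<16>{});
}
```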
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " 
%129," + " %130, %131," + " p, %133, %134;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & 
d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p, %136, %137;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), 
"+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %6, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e5m2 " + "{%0, %1}," + " %2," + " %3," + " %4, %5," + " p, %7, %8;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %9, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e5m2 " + "{%0, %1}," + "{%2, %3, %4, %5}," + " %6," + " %7, %8," + " p, %10, %11;\n" + "}\n" + : "+r"(d0), "+r"(d1) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x8x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7," + " p, %9, %10;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x8x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x8x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %8, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3}," + 
" %4," + " %5," + " %6, %7," + " p, %9, %10;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %11, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10," + " p, %12, %13;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, %14;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x16x64 TN F32+=E5M2*E5M2 +template < + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x16x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17;\n" + "}\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %12, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11," + " p, %13, %14;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& 
a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %15, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14," + " p, %16, %17;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x32x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x32x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, 
float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %20, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19," + " p, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + 
uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %23, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22," + " p, %24, %25;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
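+// A minimal usage sketch for these atoms (shown here on the smallest sparse F32 atom in this
+// family), assuming `desc_a`/`desc_b` are valid GMMA shared-memory descriptors and `e` holds the
+// packed sparsity metadata for this warpgroup-wide MMA, all built elsewhere by the caller:
+//
+//   #if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+//   float acc[4] = {0.f, 0.f, 0.f, 0.f};                 // CRegisters = float[4]
+//   SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_SS_TN<>::fma(
+//       desc_a, desc_b,                                  // A/B shared-memory descriptors
+//       acc[0], acc[1], acc[2], acc[3],                  // accumulators updated in place
+//       e,                                               // ERegisters: sparsity metadata
+//       GMMA::ScaleOut::One);                            // One accumulates; Zero overwrites
+//   #endif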
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x64x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x64x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + 
"{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = 
float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x96x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x96x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, 
+ uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, %56, %57;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %36, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35," + " p, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), 
"+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %39, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38," + " p, %40, %41;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float 
& d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x128x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x128x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, 
float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, 
uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %52, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51," + " p, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %55, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, 
%30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54," + " p, %56, %57;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %100, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " 
%48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99," + " p, %101, %102;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x192x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x192x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & 
d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %103, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102," + " p, %104, %105;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & 
d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %68, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67," + " p, %69, %70;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, 
uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %71, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70," + " p, %72, %73;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, 
+ float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %132, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131," + " p, %133, %134;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), 
"+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x256x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x256x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %135, 
0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134," + " p, %136, %137;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace SM90::GMMA::SPARSE + +} // namespace cute + +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) +#include "mma_sm90_gmma_sparse_ext.hpp" +#endif diff --git a/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp b/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp new file mode 100644 index 0000000000..c224e4034e --- /dev/null +++ b/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp @@ -0,0 +1,60445 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>  // CUTE_HOST_DEVICE
+
+#include "cutlass/arch/synclog.hpp"
+
+// Config
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+#  define CUTE_ARCH_MMA_SM90A_ENABLED
+#endif
+
+namespace cute {
+
+namespace SM90::GMMA::SPARSE {
+
+// SPARSE GMMA 64x24x32 F16+=F16*F16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x24x32_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[6];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3,
+      uint32_t & d4, uint32_t & d5,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f16.f16.f16 "
+      "{%0, %1, %2, %3, %4, %5},"
+      " %6,"
+      " %7,"
+      " %8, %9,"
+      " p, %11, %12, %13, %14;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      : "l"(desc_a),
+        "l"(desc_b),
+        "r"(e), "n"(int32_t(spsel)),
+        "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x32 F16+=F16*F16
+template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5}," + "{%6, %7, %8, %9}," + " %10," + " %11, %12," + " p, %14, %15, %16;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + " %10," + " %11," + " %12, %13," + " p, %15, %16, %17, %18;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + 
GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + "{%10, %11, %12, %13}," + " %14," + " %15, %16," + " p, %18, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA 
= GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + " %14," + " %15," + " %16, %17," + " p, %19, %20, %21, %22;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + "{%14, %15, %16, %17}," + " %18," + " %19, %20," + " p, %22, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + " %18," + " %19," + " %20, %21," + " p, %23, %24, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), 
"+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + "{%18, %19, %20, %21}," + " %22," + " %23, %24," + " p, %26, %27, %28;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t const& e, + GMMA::ScaleOut const 
scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26, %27, %28;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x88x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + " %22," + " %23," + " %24, %25," + " p, %27, %28, %29, %30;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + "{%22, %23, %24, %25}," + " %26," + " %27, %28," + " p, %30, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + 
"+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + " %26," + " %27," + " %28, %29," + " p, %31, %32, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t 
& d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + "{%26, %27, %28, %29}," + " %30," + " %31, %32," + " p, %34, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), 
"+r"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t 
& d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + " %30," + " %31," + " %32, %33," + " p, %35, %36, %37, %38;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + "{%30, %31, %32, %33}," + " %34," + " %35, %36," + " p, %38, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), 
"+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + " %34," + " %35," + " %36, %37," + " p, %39, %40, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x32_F16F16F16_RS +{ + using DRegisters = void; + using 
ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + "{%34, %35, %36, %37}," + " %38," + " %39, %40," + " p, %42, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), 
"+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + " %38," + " %39," + " %40, %41," + " p, %43, %44, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = 
uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + "{%38, %39, %40, %41}," + " %42," + " %43, %44," + " p, %46, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t 
& d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " 
%44," + " %45, %46," + " p, %48, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + " %42," + " %43," + " %44, %45," + " p, %47, %48, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + "{%42, %43, %44, %45}," + " %46," + " %47, %48," + " p, %50, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = 
uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, 
uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, 
%6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + " %46," + " %47," + " %48, %49," + " p, %51, %52, %53, %54;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + "{%46, %47, %48, %49}," + " %50," + " %51, %52," + " p, %54, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), 
"+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + " %50," + " %51," + " %52, %53," + " p, %55, %56, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), 
"n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + "{%50, %51, %52, %53}," + " %54," + " %55, %56," + " p, %58, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x32 
F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + static_assert(tnspA == 
GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & 
d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + " %54," + " %55," + " %56, %57," + " p, %59, %60, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t 
& d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + "{%54, %55, %56, %57}," + " %58," + " %59, %60," + " p, %62, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & 
d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t 
const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p, %64, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n232k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + " %58," + " %59," + " %60, %61," + " p, %63, %64, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, 
%5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + "{%58, %59, %60, %61}," + " %62," + " %63, %64," + " p, %66, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, 
%38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " 
%40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69, %70;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x32_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, 
%61}," + " %62," + " %63," + " %64, %65," + " p, %67, %68, %69, %70;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x32 F16+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x32_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k32.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, 
%52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + "{%62, %63, %64, %65}," + " %66," + " %67, %68," + " p, %70, %71, %72;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18, %19, %20;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); 
+ + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21, %22;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26, %27, %28;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> 
+struct GMMA_64x40x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29, %30;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p, %29, %30, %31, %32;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : 
"l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33, %34;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float 
& d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34, %35, %36;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37, %38;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42, %43, %44;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, 
float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, %45, %46;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46, %47, %48;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), 
"+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p, %48, %49, %50;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x32 F32+=F16*F16 
+template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50, %51, %52;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, 
float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53, %54;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.f16.f16 " + "{%0, %1, 
%2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58, %59, %60;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61, %62;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), 
"+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62, %63, %64;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + 
"+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p, %64, %65, %66;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66, %67, %68;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69, %70;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = 
uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %72, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " %70, %71," + " p, %73, %74, %75, %76;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = 
float[68]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %75, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " %73, %74," + " p, %76, %77, %78;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = 
uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p, %77, %78, %79, %80;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using 
ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p, %80, %81, %82;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB 
= GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %80, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " %78, %79," + " p, %81, %82, %83, %84;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 
64x152x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %83, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " %81, %82," + " p, %84, %85, %86;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), 
"n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p, %85, %86, %87, %88;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + 
"+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p, %88, %89, %90;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), 
"+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %88, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " %86, %87," + " p, %89, %90, %91, %92;\n" + "}\n" + : "+f"(d00), 
"+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %91, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, 
%6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " %89, %90," + " p, %92, %93, %94;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float 
& d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p, %93, %94, %95, %96;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, 
float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p, %96, %97, %98;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = 
float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %96, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " %94, %95," + " p, %97, %98, %99, %100;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %99, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " %97, %98," + " p, %100, %101, %102;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), 
"+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %104, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, 
%42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " %102, %103," + " p, %105, %106, %107, %108;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float 
& d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %107, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " %105, %106," + " p, %108, %109, %110;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + 
GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p, %109, %110, %111, %112;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + 
"+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.f16.f16 " + 
"{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p, %112, %113, %114;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, 
float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %112, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " %110, %111," + " p, %113, %114, %115, %116;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %115, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " %113, %114," + " p, %116, %117, %118;\n" + "}\n" + : 
"+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, 
float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p, %117, %118, %119, %120;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x32_F32F16F16_RS 
+{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p, %120, %121, %122;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), 
+ "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & 
d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %120, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " %118, %119," + " p, %121, %122, %123, %124;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major 
layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %123, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " %121, %122," + " p, %124, %125, %126;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), 
"+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + 
uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p, %125, %126, %127, %128;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A 
must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p, %128, %129, %130;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), 
"+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x32_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, 
float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %128, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " %126, %127," + " p, %129, %130, %131, %132;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x32 F32+=F16*F16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x248x32_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %131, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " %129, %130," + " p, %132, %133, %134;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), 
"+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18, %19, %20;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21, %22;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26, %27, %28;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + 
"l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29, %30;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p, %29, %30, %31, %32;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33, %34;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = 
GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34, %35, %36;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, 
%22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37, %38;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42, %43, %44;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + 
GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, %45, %46;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut 
const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46, %47, %48;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p, %48, %49, %50;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), 
"+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50, %51, %52;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn 
scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53, %54;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, 
float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58, %59, %60;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + 
GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61, %62;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " 
%40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62, %63, %64;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p, %64, %65, %66;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + 
"+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66, %67, %68;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), 
"+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69, %70;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), 
"r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %72, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " %70, %71," + " p, %73, %74, %75, %76;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), 
"n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %75, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " %73, %74," + " p, %76, %77, %78;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), 
"r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p, %77, %78, %79, %80;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), 
"+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p, %80, %81, %82;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), 
"+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %80, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " %78, %79," + " p, %81, %82, %83, %84;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), 
"+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %83, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " %81, %82," + " p, %84, %85, %86;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), 
"+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p, %85, %86, %87, 
%88;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " 
%16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p, %88, %89, %90;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { 
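+    // The inline PTX below follows the operand-numbering convention shared by
+    // the surrounding sparse GMMA wrappers: with N accumulator registers
+    // (N == 84 for this 64x168x32 atom), the accumulators bind to %0..%(N-1),
+    // followed by desc_a (%N), desc_b (%N+1), the sparse metadata word e
+    // (%N+2), the immediate sparse selector spsel (%N+3), the runtime scale_D
+    // flag (%N+4, consumed via predicate p), and the immediate scaleA, scaleB,
+    // tnspA and tnspB modifiers (%N+5..%N+8). The RS variants substitute four
+    // 32-bit A fragments for desc_a and drop tnspA, since register-sourced A
+    // must be K-major (see the static_assert in each RS struct).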
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %88, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " %86, %87," + " p, %89, %90, %91, %92;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & 
d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %91, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " %89, %90," + " p, %92, %93, %94;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, 
float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p, %93, %94, %95, %96;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x176x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p, %96, %97, %98;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : 
"r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %96, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " %94, %95," + " p, %97, %98, %99, %100;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + 
"+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %99, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " 
%40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " %97, %98," + " p, %100, %101, %102;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, 
float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %104, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " %102, %103," + " p, %105, %106, %107, %108;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE 
static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %107, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " %105, %106," + " p, %108, %109, %110;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), 
"+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, 
%84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p, %109, %110, %111, %112;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & 
d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p, %112, %113, %114;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x216x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %112, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " %110, %111," + " p, %113, %114, %115, %116;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), 
"+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %115, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " %113, %114," + " p, %116, %117, %118;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float 
& d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p, %117, %118, %119, %120;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), 
"+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, 
%46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p, %120, %121, %122;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & 
d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %120, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " %118, %119," + " p, %121, %122, %123, %124;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + 
"l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %123, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, 
%78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " %121, %122," + " p, %124, %125, %126;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & 
d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p, %125, %126, %127, %128;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), 
"+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, 
%62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p, %128, %129, %130;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x32_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, 
float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %128, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " %126, %127," + " p, %129, %130, %131, %132;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), 
"+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x32 F32+=BF16*BF16 +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x32_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %131, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " %129, %130," + " p, %132, %133, %134;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& 
e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & 
d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p, %29, %30;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
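Editor's note (illustrative only, not part of the patch): every struct in this hunk follows the same calling pattern — accumulator references first, then the A operand (a shared-memory descriptor for the `_SS` variants, or four 32-bit registers for the `_RS` variants), the B shared-memory descriptor, the 32-bit sparse-metadata word `e`, and finally `scale_D`. The minimal sketch below shows a direct call to one of the smaller atoms so the operand shape is easy to see. The `cute::` namespace qualification and the include path are assumptions on my part; the atom name, its template defaults, and the `fma` signature are taken from the struct definitions above. Real kernels would normally reach these atoms through CuTe's higher-level MMA machinery, build `desc_a`/`desc_b`/`e` from tiled shared-memory layouts, and must be compiled for sm_90a so that CUTE_ARCH_MMA_SM90A_ENABLED is defined.

// Sketch of a direct invocation of a sparse GMMA atom (assumed include path).
#include <cute/arch/mma_sm90_gmma_sparse.hpp>

__device__ void example_sparse_gmma_64x24x16_tf32(
    uint64_t desc_a,      // SMEM descriptor for operand A (built elsewhere)
    uint64_t desc_b,      // SMEM descriptor for operand B (built elsewhere)
    uint32_t e,           // sparsity metadata word (ERegisters = uint32_t[1])
    float   (&acc)[12])   // accumulators; count matches CRegisters = float[12]
{
  // Default template arguments: scaleA = scaleB = ScaleIn::One, spsel = SparseSel::Zero.
  using Atom = cute::SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_SS_TN<>;

  // Operand order mirrors the fma signature above:
  // desc_a, desc_b, d00..d11, e, scale_D.
  Atom::fma(desc_a, desc_b,
            acc[0], acc[1], acc[2],  acc[3],
            acc[4], acc[5], acc[6],  acc[7],
            acc[8], acc[9], acc[10], acc[11],
            e, cute::GMMA::ScaleOut::One);
}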
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, 
%14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x16_F32TF32TF32_RS_TN +{ + using DRegisters = 
void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, %45;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n80k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p, %48, %49;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & 
d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n104k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + 
"+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), 
"n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p, %64, %65;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x120x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & 
d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, 
float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %72, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " %70, %71," + " p, %73, %74;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, 
float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %75, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " %73, %74," + " p, %76, %77;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, 
float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p, %77, %78;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float 
& d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p, %80, %81;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & 
d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %80, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " %78, %79," + " p, %81, %82;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & 
d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %83, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " %81, %82," + " p, %84, %85;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & 
d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p, %85, %86;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & 
d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p, %88, %89;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & 
d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %88, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " %86, %87," + " p, %89, %90;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x16_F32TF32TF32_RS_TN +{ + using DRegisters = 
void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %91, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " %89, %90," + " p, %92, %93;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_RS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p, %93, %94;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + 
"+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p, %96, %97;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), 
"+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %96, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, 
%69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " %94, %95," + " p, %97, %98;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %99, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " %97, %98," + " p, %100, %101;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & 
d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %104, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " %102, %103," + " p, %105, %106;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x200x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %107, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " %105, %106," + " p, %108, %109;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), 
"+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, 
%55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p, %109, %110;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & 
d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p, %112, %113;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x16_F32TF32TF32_SS_TN +{ + using 
DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %112, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " %110, %111," + " p, %113, %114;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), 
"+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %115, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, 
%15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " %113, %114," + " p, %116, %117;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + 
float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p, %117, %118;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), 
"n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p, %120, %121;\n" + "}\n" + : "+f"(d000), "+f"(d001), 
"+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & 
d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %120, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " %118, %119," + " p, %121, %122;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> 
+struct GMMA_64x232x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %123, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " %121, %122," + " p, %124, %125;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), 
"+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & 
d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p, %125, %126;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + 
CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p, %128, %129;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + 
"+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x16_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + 
float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %128, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " %126, %127," + " p, %129, %130;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x16 TN F32+=TF32*TF32 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x16_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = 
uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %131, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k16.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " %129, %130," + " p, %132, %133;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), 
"+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & 
d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + 
uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + 
"l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + 
uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t 
& d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm 
volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + 
asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D 
= GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t 
& d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, 
uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + 
uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> 
+struct GMMA_64x208x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), 
"+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + 
uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), 
"+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, 
%51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & 
d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), 
"+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, 
%36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), 
"+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + 
"+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, 
uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), 
"+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32S8S8_RS_TN +{ + using DRegisters = void; 
+ using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
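+
+// Note: a minimal, hypothetical usage sketch of one of the RS atoms above; the
+// names frag_a, desc_b, meta, and acc are illustrative only. In practice these
+// atoms are selected through their cute::MMA_Traits specializations and issued
+// via cute::gemm(), with warpgroup_arrive(), warpgroup_commit_batch(), and
+// warpgroup_wait<0>() fencing the asynchronous wgmma as usual.
+//
+//   uint32_t frag_a[4];     // A operand in registers (RS atom), per-thread fragment
+//   uint64_t desc_b;        // shared-memory matrix descriptor for B
+//   uint32_t meta;          // packed 2:4 sparsity metadata (the E operand)
+//   uint32_t acc[72] = {};  // S32 accumulator fragment for the 64x144 tile
+//
+//   SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN<>::fma(
+//       frag_a[0], frag_a[1], frag_a[2], frag_a[3],
+//       desc_b,
+//       acc[0], acc[1], /* ..., */ acc[71],   // all 72 accumulators are passed explicitly
+//       meta,
+//       GMMA::ScaleOut::One);  // ScaleOut::One accumulates (scale-d != 0);
+//                              // ScaleOut::Zero overwrites the accumulators.
+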
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + 
"+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), 
"+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, 
%83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & 
d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
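The atoms above differ only in the tile width N and in whether the A operand is sourced from registers (RS) or from shared memory via a descriptor (SS); the register accounting they encode follows directly from the declarations visible in the structs. The short standalone C++ sketch below restates that accounting and checks it against a few of the CRegisters sizes shown above; the helper name is hypothetical and illustrative, not a CUTLASS API, and the sketch is not part of the diff itself.

// For an m64nNk64 sparse GMMA with s32 accumulators, the 64 x N output tile
// is spread over the 128 threads of the warpgroup, so each thread carries
// N/2 packed 32-bit accumulators -- exactly the CRegisters sizes declared
// above. The RS variants additionally keep the compressed 2:4-sparse A
// fragment in four 32-bit registers per thread (ARegisters), the SS variants
// take a single 64-bit shared-memory descriptor instead, and ERegisters is
// the one 32-bit sparsity-metadata word passed as `e`.

// Hypothetical helper, for illustration only.
constexpr int sparse_gmma_s32_acc_regs_per_thread(int n) { return n / 2; }

static_assert(sparse_gmma_s32_acc_regs_per_thread(176) ==  88,
              "matches GMMA_64x176x64_S32S8S8_RS_TN::CRegisters");
static_assert(sparse_gmma_s32_acc_regs_per_thread(208) == 104,
              "matches GMMA_64x208x64_S32S8S8_RS_TN::CRegisters");
static_assert(sparse_gmma_s32_acc_regs_per_thread(240) == 120,
              "matches GMMA_64x240x64_S32S8S8_RS_TN::CRegisters");
static_assert(sparse_gmma_s32_acc_regs_per_thread(24)  ==  12,
              "matches GMMA_64x24x64_S32S8U8_SS_TN::CRegisters");

The same accounting explains why the inline asm of each SATURATE variant is identical to its non-saturating twin except for the `.satfinite` qualifier: the operand layout depends only on N and on the RS/SS choice, not on the saturation behavior.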
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), 
"+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, 
uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + 
CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), 
"+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + 
uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=S8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32S8S8_RS_TN_SATURATE +{ + 
using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), 
"+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32S8U8_SS_TN_SATURATE +{ + using 
DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = 
uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), 
"+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & 
d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, 
uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, 
uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & 
d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & 
d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + 
uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, 
uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& 
desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), 
"+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, 
uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, 
uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), 
"+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & 
d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), 
"+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, 
desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg 
.pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, 
uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, 
%19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), 
"+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN_SATURATE 
without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), 
"r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), 
"+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + 
"+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, 
%38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, 
uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, 
uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p;\n" + "}\n" + : 
"+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + 
uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = 
void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), 
"+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, 
+ uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 
64x240x64 TN S32+=S8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, 
%116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN 
S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> 
+struct GMMA_64x48x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + 
"+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = 
uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, 
uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, 
uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & 
d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & 
d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + 
uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, 
uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = 
uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + 
"r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), 
"+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, 
uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + 
uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), 
"+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + 
"{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + 
uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), 
"+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
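+    // RS variant: operand A is supplied directly in registers (a00..a03) while operand B is read
+    // through the shared-memory matrix descriptor desc_b; e carries the sparsity metadata for A,
+    // and spsel selects which threads contribute that metadata.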
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + 
uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8.satfinite " + 
"{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), 
"+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), 
"r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + 
"+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + 
"+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), 
"+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, 
%19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & 
d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, 
uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), 
"r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, 
%102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & 
d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + 
"+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, 
uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
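// Illustrative note (not part of the upstream header): the _RS_ atoms above carry the
// A operand as four 32-bit register fragments (ARegisters = uint32_t[4], a000..a003),
// while the _SS_ atoms further below take a 64-bit shared-memory matrix descriptor
// (desc_a) for A, just as all of these atoms do for B. A minimal compile-time check of
// that convention, with the enclosing cute::SM90::GMMA::SPARSE namespace inferred from
// the CUTE_INVALID_CONTROL_PATH strings:
using RsAtomSketch = cute::SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN<>;
static_assert(sizeof(RsAtomSketch::ARegisters) == 4 * sizeof(uint32_t),
              "RS atoms source A from four 32-bit register fragments");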
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=U8*S8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, 
%97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
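// Illustrative sketch (not part of the upstream header): direct use of the smallest SS
// atom above, GMMA_64x24x64_S32U8U8_SS_TN. The qualified name is inferred from the
// CUTE_INVALID_CONTROL_PATH string; in real code these atoms are normally reached via
// cute::MMA_Atom / MMA_Traits, and the wgmma must be framed by the usual warpgroup
// fence / commit / wait sequence, omitted here for brevity.
__device__ void sparse_gmma_64x24x64_sketch(uint64_t desc_a,      // SMEM descriptor of the compressed 2:4-sparse u8 A tile
                                            uint64_t desc_b,      // SMEM descriptor of the u8 B tile (k64 x n24)
                                            uint32_t e,           // packed sparsity metadata (ERegisters = uint32_t[1])
                                            uint32_t (&acc)[12])  // per-thread s32 accumulators (CRegisters = uint32_t[12])
{
  using Atom = cute::SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN<>;
  // One warpgroup-wide sparse MMA, acc (64x24, s32) += A * B; scale_D is left at its
  // default of GMMA::ScaleOut::One so the existing accumulator contents are retained.
  Atom::fma(desc_a, desc_b,
            acc[0], acc[1], acc[2], acc[3], acc[4],  acc[5],
            acc[6], acc[7], acc[8], acc[9], acc[10], acc[11],
            e);
}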
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
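// Illustrative note (not part of the upstream header): each _SATURATE twin differs from
// its base atom only in its name and in the ".satfinite" qualifier on the PTX
// instruction, which clamps the s32 result to the representable range when the
// accumulation overflows; the operand footprint is identical.
using SparseU8U8Sketch    = cute::SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN<>;
using SparseU8U8SatSketch = cute::SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN_SATURATE<>;
static_assert(sizeof(SparseU8U8Sketch::CRegisters) == sizeof(SparseU8U8SatSketch::CRegisters),
              "saturation changes the arithmetic, not the register layout");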
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, 
%23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, 
uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, 
uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & 
d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, 
+ uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, 
uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & 
d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 
64x176x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), 
+ "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, 
%82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & 
d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& 
desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + 
"+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, 
uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, 
uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + 
"+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, 
uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, 
uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & 
d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; 
+ using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), 
"+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + 
"+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), 
"+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), 
"+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), 
"+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, 
%69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + 
"{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & 
d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & 
d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + 
"+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, 
" + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, 
uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, 
%111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, 
uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + 
"r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN S32+=U8*U8 +template < + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, 
%60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p;\n" + "}\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5}," + " %6," + " %7," + " %8, %9," + " p, %11, %12;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5}," + "{%6, %7, %8, %9}," + " %10," + " %11, %12," + " p, %14, %15;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, 
+ GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + " %10," + " %11," + " %12, %13," + " p, %15, %16;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; 
+ using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + "{%10, %11, %12, %13}," + " %14," + " %15, %16," + " p, %18, %19;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F32E4M3E4M3_RS_TN +{ + using 
DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = 
GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p, %29, %30;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; 
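+
+// Implementation note (common to the SM90 sparse GMMA wrappers in this file):
+//   * _SS_ variants source both A and B from shared memory via 64-bit matrix
+//     descriptors (desc_a, desc_b); _RS_ variants source A from registers
+//     (a00..a03) and B from a shared memory descriptor.
+//   * The d registers are the per-thread accumulator fragment; F16 accumulators
+//     are packed two halves per 32-bit register, F32 accumulators are one float
+//     per register.
+//   * e carries the structured-sparsity metadata for A and spsel is the
+//     sparsity-selector immediate.
+//   * scale_D is lowered to the wgmma scale-d predicate: non-zero accumulates
+//     into D (D = A*B + D), zero overwrites it (D = A*B); scaleA/scaleB map to
+//     the +/-1 immediate input scales.
+//   See the PTX ISA description of wgmma.mma_async.sp for the authoritative
+//   operand semantics.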
+ +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + " %14," + " %15," + " %16, %17," + " p, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), 
"+r"(d13) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + "{%14, %15, %16, %17}," + " %18," + " %19, %20," + " p, %22, %23;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, 
%20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, 
+ uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + " %18," + " %19," + " %20, %21," + " p, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + "{%18, %19, %20, %21}," + " %22," + " %23, %24," + " p, %26, %27;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, %45;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + 
CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), 
"+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p, %48, %49;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = 
uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + " %22," + " %23," + " %24, %25," + " p, %27, %28;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + "{%22, %23, %24, %25}," + " %26," + " %27, %28," + " p, %30, %31;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & 
d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + " %26," + " %27," + " %28, %29," + " p, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), 
"+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + "{%26, %27, %28, %29}," + " %30," + " %31, %32," + " p, %34, %35;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float 
& d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + 
"r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, 
float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p, %64, %65;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + " %30," + " %31," + " %32, %33," + " p, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) 
+ : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + "{%30, %31, %32, %33}," + " %34," + " %35, %36," + " p, %38, %39;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + 
float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & 
d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, 
%27, %28, %29, %30, %31, " + " %32, %33}," + " %34," + " %35," + " %36, %37," + " p, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + "{%34, %35, %36, %37}," + " %38," + " %39, %40," + " p, %42, %43;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = 
GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %72, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " %70, %71," + " p, %73, %74;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using 
ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %75, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " %73, %74," + " p, %76, %77;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = 
uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, %45;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p, %77, %78;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), 
"+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p, %80, %81;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + 
"+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + " %38," + " %39," + " %40, %41," + " p, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + "{%38, %39, %40, %41}," + " %42," + " %43, %44," + " p, %46, %47;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, 
float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %80, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " %78, %79," + " p, %81, %82;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, 
float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %83, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " %81, %82," + " p, %84, %85;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& 
desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p, %48, %49;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " 
%48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p, %85, %86;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " 
+ " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p, %88, %89;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, 
%36, %37, %38, %39, " + " %40, %41}," + " %42," + " %43," + " %44, %45," + " p, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + "{%42, %43, %44, %45}," + " %46," + " %47, %48," + " p, %50, %51;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %88, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " %86, %87," + " p, %89, %90;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), 
"+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %91, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " %89, %90," + " p, %92, %93;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + 
"+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t 
const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p, %93, %94;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F32+=E4M3*E4M3 
+template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p, %96, %97;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), 
"+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + " %46," + " %47," + " %48, %49," + " p, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x184x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + "{%46, %47, %48, %49}," + " %50," + " %51, %52," + " p, %54, %55;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, 
float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %96, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " %94, %95," + " p, %97, %98;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using 
ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %99, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " %97, %98," + " p, %100, %101;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + 
"l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + " %50," + " %51," + " %52, %53," + " p, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> 
+struct GMMA_64x200x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + "{%50, %51, %52, %53}," + " %54," + " %55, %56," + " p, %58, %59;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, 
float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %104, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " %102, %103," + " p, %105, %106;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + 
"r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %107, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " %105, %106," + " p, %108, %109;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), 
"+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), 
"+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + 
"+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p, %109, %110;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), 
"+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + 
float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p, %112, %113;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, 
uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + " %54," + " %55," + " %56, %57," + " p, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & 
d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + "{%54, %55, %56, %57}," + " %58," + " %59, %60," + " p, %62, %63;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, 
float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %112, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " %110, %111," + " p, %113, %114;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %115, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " %113, %114," + " p, %116, %117;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), 
"+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), 
"+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p, %64, %65;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), 
"+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, 
%98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p, %117, %118;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & 
d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p, %120, %121;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + " %58," + " %59," + " %60, %61," + " p, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = 
uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + "{%58, %59, %60, %61}," + " %62," + " %63, %64," + " p, %66, %67;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + 
float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %120, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " %118, %119," + " p, %121, %122;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), 
"+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %123, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, 
%12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " %121, %122," + " p, %124, %125;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t 
& d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + 
uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float 
& d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p, %125, %126;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), 
"n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, 
%108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p, %128, %129;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F16E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, 
uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + " %62," + " %63," + " %64, %65," + " p, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F16+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F16E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + uint32_t const& e, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + "{%62, %63, %64, %65}," + " %66," + " %67, %68," + " p, %70, %71;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F32E4M3E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float 
& d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %128, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " %126, %127," + " p, %129, %130;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
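+// Conventions shared by the generated sparse GMMA wrappers above and below
+// (summarized here for readability; every struct follows the same pattern):
+//   * _SS_ atoms take both A and B as 64-bit shared-memory matrix descriptors
+//     (ARegisters = uint64_t[1]); _RS_ atoms take A as four 32-bit register
+//     fragments (ARegisters = uint32_t[4]) and B as a descriptor.
+//   * `e` is the 32-bit sparsity-metadata word (ERegisters), and the SparseSel
+//     template parameter `spsel` is baked into the PTX as an immediate.
+//   * `scale_D` is a runtime argument: the `setp`/predicate `p` idiom maps
+//     GMMA::ScaleOut::One to "accumulate into D" and ScaleOut::Zero to
+//     "overwrite D", while scaleA/scaleB are compile-time immediates.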
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F32+=E4M3*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F32E4M3E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %131, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " 
%129, %130," + " p, %132, %133;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5}," + " %6," + " %7," + " %8, %9," + " p, %11, %12;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F16+=E4M3*E5M2 +template < + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5}," + "{%6, %7, %8, %9}," + " %10," + " %11, %12," + " p, %14, %15;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + 
CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + " %10," + " %11," + " %12, %13," + " p, %15, %16;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, 
uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + "{%10, %11, %12, %13}," + " %14," + " %15, %16," + " p, %18, %19;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + 
float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + 
fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p, %29, %30;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x48x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + " %14," + " %15," + " %16, %17," + " p, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
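+// Comment-only usage sketch (illustrative; placeholder names, not generated code).
+// These atoms are normally reached through CuTe's MMA_Atom/TiledMMA layer, which
+// also issues the surrounding wgmma fence/commit/wait synchronization. A direct
+// call to the SS variant above passes shared-memory matrix descriptors for A and B,
+// the packed accumulator registers, and the sparsity-metadata word `e`; `desc_a`,
+// `desc_b`, `acc`, and `e` below are hypothetical placeholders.
+//
+//   using Atom = SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_SS_TN<>;
+//   uint32_t acc[14] = {};  // 14 packed f16x2 accumulator words (CRegisters)
+//   Atom::fma(desc_a, desc_b,
+//             acc[0], acc[1], acc[2],  acc[3],  acc[4],  acc[5],  acc[6],
+//             acc[7], acc[8], acc[9],  acc[10], acc[11], acc[12], acc[13],
+//             e, GMMA::ScaleOut::One);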
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + "{%14, %15, %16, %17}," + " %18," + " %19, %20," + " p, %22, %23;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + 
"+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, 
+ uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + " %18," + " %19," + " %20, %21," + " p, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + "{%18, %19, %20, %21}," + " %22," + " %23, %24," + " p, %26, %27;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t 
const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, %45;\n" + "}\n" + : "+f"(d00), 
"+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, 
uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p, %48, %49;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t 
& d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + " %22," + " %23," + " %24, %25," + " p, %27, %28;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + "{%22, %23, %24, %25}," + " %26," + " %27, %28," + " p, %30, %31;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn 
scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float 
& d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + " %26," + " %27," + " %28, %29," + " p, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), 
"n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + "{%26, %27, %28, %29}," + " %30," + " %31, %32," + " p, %34, %35;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, 
float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, 
%20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN 
F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t 
const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, 
%34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p, %64, %65;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + " %30," + " %31," + " %32, %33," + " p, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
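+// Exposition note on the register aliases above (descriptive comment only): the
+// nested DRegisters/ARegisters/BRegisters/CRegisters/ERegisters types are what
+// CuTe's MMA_Atom machinery uses to size operand fragments. For this 64x120x64
+// atom, ARegisters = uint64_t[1] and BRegisters = uint64_t[1] indicate both
+// operands arrive as shared-memory descriptors (the SS variant; the RS variant
+// below instead takes A as four 32-bit registers), CRegisters = uint32_t[30] is
+// the packed f16 accumulator fragment, and ERegisters = uint32_t[1] carries the
+// sparsity metadata passed to fma() as `e`.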
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + "{%30, %31, %32, %33}," + " %34," + " %35, %36," + " p, %38, %39;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, 
float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
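    // RS variant: the A fragment is supplied from registers (a00..a03) while B is read through a
    // shared-memory descriptor; e carries the 2:4 sparsity metadata consumed by wgmma.mma_async.sp.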
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + " %34," + " %35," + " %36, %37," + " p, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), 
"+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + "{%34, %35, %36, %37}," + " %38," + " %39, %40," + " p, %42, %43;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + 
CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %72, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " %70, %71," + " p, %73, %74;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, 
float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %75, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " %73, %74," + " p, %76, %77;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, 
uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, 
%45;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p, %77, %78;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), 
"+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p, %80, %81;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + 
"+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + " %38," + " %39," + " %40, %41," + " p, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + 
GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + "{%38, %39, %40, %41}," + " %42," + " %43, %44," + " p, %46, %47;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & 
d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %80, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " %78, %79," + " p, %81, %82;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & 
d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %83, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " %81, %82," + " p, %84, %85;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, 
+ uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, 
%39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p, %48, %49;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p, %85, %86;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), 
"+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, 
%79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p, %88, %89;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + " %42," + " %43," + " %44, %45," + " p, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), 
"+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + "{%42, %43, %44, %45}," + " %46," + " %47, %48," + " p, %50, %51;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %88, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " %86, %87," + " p, %89, %90;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_SS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %91, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " %89, %90," + " p, %92, %93;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + 
"+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x176x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, 
float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p, %93, %94;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t 
const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p, %96, %97;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + " %46," + " %47," + " %48, %49," + " p, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & 
d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + "{%46, %47, %48, %49}," + " %50," + " %51, %52," + " p, %54, %55;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & 
d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %96, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " %94, %95," + " p, %97, %98;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, 
float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %99, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " %97, %98," + " p, %100, %101;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN 
F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + " %50," + " %51," + " %52, %53," + " p, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, 
uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + "{%50, %51, %52, %53}," + " %54," + " %55, %56," + " p, %58, %59;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, 
float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %104, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " %102, %103," + " p, %105, %106;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %107, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " %105, %106," + " p, %108, %109;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), 
+ "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), 
"+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p, %109, %110;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + 
"+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + 
"setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p, %112, %113;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & 
d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + " %54," + " %55," + " %56, %57," + " p, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + uint32_t const& e, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + "{%54, %55, %56, %57}," + " %58," + " %59, %60," + " p, %62, %63;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, 
float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %112, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " %110, %111," + " p, %113, %114;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & 
d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %115, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " %113, %114," + " p, %116, %117;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), 
"+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), 
"+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p, %64, %65;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_RS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p, %117, %118;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), 
"+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, 
float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p, %120, %121;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t 
const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + " %58," + " %59," + " %60, %61," + " p, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, 
uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + "{%58, %59, %60, %61}," + " %62," + " %63, %64," + " p, %66, %67;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, 
float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %120, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " %118, %119," + " p, %121, %122;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + 
"+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %123, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, 
" + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " %121, %122," + " p, %124, %125;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + 
uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & 
d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p, %125, %126;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F32+=E4M3*E5M2 +template < + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p, %128, %129;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + 
"+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F16E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + " %62," + " %63," + " %64, %65," + " p, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F16+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F16E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, 
%14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + "{%62, %63, %64, %65}," + " %66," + " %67, %68," + " p, %70, %71;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F32E4M3E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & 
d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %128, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " %126, %127," + " p, %129, %130;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F32+=E4M3*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x248x64_F32E4M3E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %131, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " %129, %130," + " p, %132, %133;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), 
"+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5}," + " %6," + " %7," + " %8, %9," + " p, %11, %12;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + 
using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5}," + "{%6, %7, %8, %9}," + " %10," + " %11, %12," + " p, %14, %15;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& 
e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + " %10," + " %11," + " %12, %13," + " p, %15, %16;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + 
".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + "{%10, %11, %12, %13}," + " %14," + " %15, %16," + " p, %18, %19;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = 
GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t 
const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p, %29, %30;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + 
uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + " %14," + " %15," + " %16, %17," + " p, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x56x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + "{%14, %15, %16, %17}," + " %18," + " %19, %20," + " p, %22, %23;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting 
to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + " %18," + " %19," + " %20, %21," + " p, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + "{%18, %19, %20, %21}," + " %22," + " %23, %24," + " p, %26, %27;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & 
d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, %45;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + 
"+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE 
GMMA 64x80x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p, %48, %49;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if 
defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + " %22," + " %23," + " %24, %25," + " p, %27, %28;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + "{%22, %23, %24, %25}," + " %26," + " %27, %28," + " p, %30, %31;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = 
uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + " %26," + " %27," + " %28, %29," + " p, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
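+// Operand convention shared by the sparse GMMA wrapper structs above and below (descriptive
+// summary only; the authoritative semantics are those of the PTX "wgmma.mma_async.sp"
+// instruction):
+//   CRegisters    : accumulator fragment, read-modify-write ("+r"/"+f" constraints).
+//   ARegisters    : one uint64_t shared-memory matrix descriptor (desc_a) for _SS_ atoms,
+//                   or four uint32_t registers holding the A fragment for _RS_ atoms.
+//   BRegisters    : one uint64_t shared-memory matrix descriptor (desc_b).
+//   ERegisters    : the structured-sparsity metadata word `e`; the immediate `spsel`
+//                   (GMMA::SparseSel) selects which threads' metadata is consumed.
+//   scale_D       : lowered to predicate `p`; when zero the accumulator is overwritten
+//                   (D = A*B) rather than accumulated into (D += A*B).
+//   scaleA/scaleB : immediate +/-1 inputs (GMMA::ScaleIn) applied to the A and B operands.
+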
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + "{%26, %27, %28, %29}," + " %30," + " %31, %32," + " p, %34, %35;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + 
uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + 
"{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct 
GMMA_64x112x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, 
desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," 
+ " p, %64, %65;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + " %30," + " %31," + " %32, %33," + " p, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + 
GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + "{%30, %31, %32, %33}," + " %34," + " %35, %36," + " p, %38, %39;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t 
const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, 
%12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + " %34," + " %35," + " %36, %37," + " p, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "l"(desc_a), + "l"(desc_b), + "r"(e), 
"n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + "{%34, %35, %36, %37}," + " %38," + " %39, %40," + " p, %42, %43;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, 
float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %72, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " %70, %71," + " p, %73, %74;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, 
float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %75, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " %73, %74," + " p, %76, %77;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, 
uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, %45;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), 
"+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p, %77, %78;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + 
"+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p, %80, %81;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), 
+ "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + " %38," + " %39," + " %40, %41," + " p, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE 
static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + "{%38, %39, %40, %41}," + " %42," + " %43, %44," + " p, %46, %47;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float 
& d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %80, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " %78, %79," + " p, %81, %82;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float 
& d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %83, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " %81, %82," + " p, %84, %85;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, 
uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p, %48, %49;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), 
"+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p, %85, %86;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), 
"+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p, %88, %89;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), 
"+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + " %42," + " %43," + " %44, %45," + " p, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), 
"n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + "{%42, %43, %44, %45}," + " %46," + " %47, %48," + " p, %50, %51;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + 
fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %88, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " %86, %87," + " p, %89, %90;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = 
GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %91, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " %89, %90," + " p, %92, %93;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), 
"n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t 
const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & 
d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p, %93, %94;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & 
d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p, %96, %97;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F16E5M2E4M3_SS_TN +{ 
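+  // Register layout of this sparse GMMA atom (the same convention is used by the
+  // other wgmma.sp atoms in this header): D has no registers of its own because the
+  // accumulator lives in the C registers; A (SS variant) is addressed through one
+  // 64-bit shared-memory matrix descriptor; E holds the 2:4 structured-sparsity
+  // metadata; B is likewise a 64-bit shared-memory descriptor. The 46 uint32_t C
+  // registers pack the 92 F16 accumulator elements owned by each of the 128 threads
+  // of the warpgroup (two F16 values per 32-bit register) for the 64x184 tile.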
+ using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + " %46," + " %47," + " %48, %49," + " p, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t 
& d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + "{%46, %47, %48, %49}," + " %50," + " %51, %52," + " p, %54, %55;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, 
float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %96, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " %94, %95," + " p, %97, %98;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & 
d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %99, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " %97, %98," + " p, %100, %101;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using 
CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + " %50," + " %51," + " %52, %53," + " p, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t 
& d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + "{%50, %51, %52, %53}," + " %54," + " %55, %56," + " p, %58, %59;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, 
float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %104, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " %102, %103," + " p, %105, %106;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + 
fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %107, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " %105, %106," + " p, %108, %109;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + 
"+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_SS_TN 
without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = 
void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p, %109, %110;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), 
"+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, 
%50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + "{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p, %112, %113;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + uint32_t const& e, + 
GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + " %54," + " %55," + " %56, %57," + " p, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, 
%17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + "{%54, %55, %56, %57}," + " %58," + " %59, %60," + " p, %62, %63;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %112, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " %110, %111," + " p, %113, %114;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, 
+ float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %115, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " %113, %114," + " p, %116, %117;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + 
"+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p, %64, %65;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = 
GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p, %117, %118;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), 
"+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p, %120, %121;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & 
d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + " %58," + " %59," + " %60, %61," + " p, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, 
uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + "{%58, %59, %60, %61}," + " %62," + " %63, %64," + " p, %66, %67;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & 
d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %120, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " %118, %119," + " p, %121, %122;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + 
CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %123, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " %121, %122," + 
" p, %124, %125;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, 
float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p, %125, %126;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using 
ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p, %128, %129;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), 
"+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F16E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, 
%34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + " %62," + " %63," + " %64, %65," + " p, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F16+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F16E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " 
%56, %57, %58, %59, %60, %61}," + "{%62, %63, %64, %65}," + " %66," + " %67, %68," + " p, %70, %71;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F32E5M2E4M3_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t 
const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %128, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " %126, %127," + " p, %129, %130;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F32+=E5M2*E4M3 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F32E5M2E4M3_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& 
a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %131, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e4m3 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " %129, %130," + " p, %132, %133;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), 
"+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %10, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5}," + " %6," + " %7," + " %8, %9," + " p, %11, %12;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[6]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, + 
uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %13, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5}," + "{%6, %7, %8, %9}," + " %10," + " %11, %12," + " p, %14, %15;\n" + "}\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x24x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x24x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %14, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + " %10," + " %11," + " %12, %13," + " p, %15, %16;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[10]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %17, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9}," + "{%10, %11, %12, %13}," + " %14," + " %15, %16," + " p, %18, %19;\n" + "}\n" + 
: "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x40x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x40x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + "{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %16, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + " %12," + " %13," + " %14, %15," + " p, %17, %18;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[12]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg 
.pred p;\n" + "setp.ne.b32 p, %19, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11}," + "{%12, %13, %14, %15}," + " %16," + " %17, %18," + " p, %20, %21;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %28, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27," + " p, %29, %30;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x48x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x48x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[24]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & 
d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %31, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30," + " p, %32, %33;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %18, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + " %14," + " %15," + " %16, %17," + " p, %19, %20;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[14]; + + CUTE_HOST_DEVICE static void 
+ fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %21, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13}," + "{%14, %15, %16, %17}," + " %18," + " %19, %20," + " p, %22, %23;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x56x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE 
GMMA 64x56x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x56x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %22, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + " %18," + " %19," + " %20, %21," + " p, %23, %24;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + 
"+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[18]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %25, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17}," + "{%18, %19, %20, %21}," + " %22," + " %23, %24," + " p, %26, %27;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut 
const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x72x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x72x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, %45;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), 
"n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %24, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + " %20," + " %21," + " %22, %23," + " p, %25, %26;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[20]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %27, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19}," + 
"{%20, %21, %22, %23}," + " %24," + " %25, %26," + " p, %28, %29;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x80x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x80x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters 
= uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p, %48, %49;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %26, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, 
%17, %18, %19, %20, %21}," + " %22," + " %23," + " %24, %25," + " p, %27, %28;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[22]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %29, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21}," + "{%22, %23, %24, %25}," + " %26," + " %27, %28," + " p, %30, %31;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, 
float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x88x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x88x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, 
%29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %30, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + " %26," + " %27," + " %28, %29," + " p, %31, %32;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = 
uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[26]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %33, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25}," + "{%26, %27, %28, %29}," + " %30," + " %31, %32," + " p, %34, %35;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, 
%13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x104x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x104x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), 
"+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %32, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + " %28," + " %29," + " %30, %31," + " p, %33, %34;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[28]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & 
d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %35, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27}," + "{%28, %29, %30, %31}," + " %32," + " %33, %34," + " p, %36, %37;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " 
%48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x112x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x112x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p, %64, %65;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), 
"+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %34, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + " %30," + " %31," + " %32, %33," + " p, %35, %36;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[30]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & 
d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %37, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29}," + "{%30, %31, %32, %33}," + " %34," + " %35, %36," + " p, %38, %39;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, 
%19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x120x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x120x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), 
"+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %38, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + " %34," + " %35," + " %36, %37," + " p, %39, %40;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F16+=E5M2*E5M2 +template < + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[34]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %41, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33}," + "{%34, %35, %36, %37}," + " %38," + " %39, %40," + " p, %42, %43;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + 
float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %72, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + " %68," + " %69," + " %70, %71," + " p, %73, %74;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x136x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x136x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[68]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, 
float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %75, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67}," + "{%68, %69, %70, %71}," + " %72," + " %73, %74," + " p, %76, %77;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %40, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + " %36," + " %37," + " %38, %39," + " p, %41, %42;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[36]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %43, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35}," + "{%36, %37, %38, %39}," + " %40," + " %41, %42," + " p, %44, %45;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %76, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + " %72," + " %73," + " %74, %75," + " p, %77, %78;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_SS_TN without 
CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x144x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x144x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[72]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %79, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71}," + "{%72, %73, %74, %75}," + " %76," + " %77, %78," + " p, %80, %81;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use 
SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %42, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + " %38," + " %39," + " %40, %41," + " p, %43, %44;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[38]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, 
uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %45, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37}," + "{%38, %39, %40, %41}," + " %42," + " %43, %44," + " p, %46, %47;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + 
cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %80, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + " %76," + " %77," + " %78, %79," + " p, %81, %82;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x152x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x152x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[76]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + 
uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %83, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75}," + "{%76, %77, %78, %79}," + " %80," + " %81, %82," + " p, %84, %85;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %44, 0;\n" + 
"wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + " %40," + " %41," + " %42, %43," + " p, %45, %46;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[40]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %47, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39}," + "{%40, %41, %42, %43}," + " %44," + " %45, %46," + " p, %48, %49;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + 
"l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %84, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + " %80," + " %81," + " %82, %83," + " p, %85, %86;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), 
"+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x160x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x160x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[80]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %87, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79}," + "{%80, %81, %82, %83}," + " %84," + " %85, %86," + " p, %88, %89;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), 
"+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %46, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + " %42," + " %43," + " %44, %45," + " p, %47, %48;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
+// SPARSE GMMA 64x168x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[42]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %49, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41}," + "{%42, %43, %44, %45}," + " %46," + " %47, %48," + " p, %50, %51;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & 
d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %88, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + " %84," + " %85," + " %86, %87," + " p, %89, %90;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x168x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x168x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[84]; + + CUTE_HOST_DEVICE static void + fma(uint32_t 
const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %91, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83}," + "{%84, %85, %86, %87}," + " %88," + " %89, %90," + " p, %92, %93;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F16+=E5M2*E5M2 
+template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %48, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + " %44," + " %45," + " %46, %47," + " p, %49, %50;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[44]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & 
d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %51, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43}," + "{%44, %45, %46, %47}," + " %48," + " %49, %50," + " p, %52, %53;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float 
& d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %92, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + " %88," + " %89," + " %90, %91," + " p, %93, %94;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x176x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x176x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[88]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float 
& d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %95, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87}," + "{%88, %89, %90, %91}," + " %92," + " %93, %94," + " p, %96, %97;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, 
uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %50, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + " %46," + " %47," + " %48, %49," + " p, %51, %52;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[46]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, 
uint32_t & d45, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %53, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45}," + "{%46, %47, %48, %49}," + " %50," + " %51, %52," + " p, %54, %55;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, 
desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %96, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + " %92," + " %93," + " %94, %95," + " p, %97, %98;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x184x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x184x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[92]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + 
float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %99, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91}," + "{%92, %93, %94, %95}," + " %96," + " %97, %98," + " p, %100, %101;\n" + "}\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, 
uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %54, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + " %50," + " %51," + " %52, %53," + " p, %55, %56;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[50]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & 
d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %57, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49}," + "{%50, %51, %52, %53}," + " %54," + " %55, %56," + " p, %58, %59;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & 
d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %104, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + " %100," + " %101," + " %102, %103," + " p, %105, %106;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x200x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x200x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[100]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & 
d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %107, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99}," + "{%100, %101, %102, %103}," + " %104," + " %105, %106," + " p, %108, %109;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), 
"n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %56, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + " %52," + " %53," + " %54, %55," + " p, %57, %58;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + 
GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[52]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %59, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51}," + "{%52, %53, %54, %55}," + " %56," + " %57, %58," + " p, %60, %61;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float 
& d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %108, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + " %104," + " %105," + " %106, %107," + " p, %109, %110;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), 
"+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x208x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x208x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[104]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %111, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103}," + 
"{%104, %105, %106, %107}," + " %108," + " %109, %110," + " p, %112, %113;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %58, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, 
%4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + " %54," + " %55," + " %56, %57," + " p, %59, %60;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[54]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %61, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53}," + "{%54, %55, %56, %57}," + " %58," + " %59, %60," + " p, %62, %63;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), 
"+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %112, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, 
%28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + " %108," + " %109," + " %110, %111," + " p, %113, %114;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x216x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x216x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[108]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & 
d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %115, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107}," + "{%108, %109, %110, %111}," + " %112," + " %113, %114," + " p, %116, %117;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); 
+#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %60, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + " %56," + " %57," + " %58, %59," + " p, %61, %62;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = 
void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[56]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %63, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55}," + "{%56, %57, %58, %59}," + " %60," + " %61, %62," + " p, %64, %65;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + 
float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %116, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + " %112," + " %113," + " %114, %115," + " p, %117, %118;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), 
"+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x224x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x224x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[112]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %119, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, 
%34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111}," + "{%112, %113, %114, %115}," + " %116," + " %117, %118," + " p, %120, %121;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, 
uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %62, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + " %58," + " %59," + " %60, %61," + " p, %63, %64;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[58]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + 
uint32_t & d56, uint32_t & d57, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %65, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57}," + "{%58, %59, %60, %61}," + " %62," + " %63, %64," + " p, %66, %67;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, 
float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %120, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + " %116," + " %117," + " %118, %119," + " p, %121, %122;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x232x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + 
GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x232x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[116]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %123, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115}," + "{%116, %117, %118, %119}," + " %120," + " %121, %122," + " p, %124, %125;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), 
"+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %64, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, 
%42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + " %60," + " %61," + " %62, %63," + " p, %65, %66;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[60]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %67, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59}," + "{%60, %61, %62, %63}," + " %64," + " %65, %66," + " p, %68, %69;\n" + "}\n" + : "+r"(d00), 
"+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" 
+ ".reg .pred p;\n" + "setp.ne.b32 p, %124, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + " %120," + " %121," + " %122, %123," + " p, %125, %126;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x240x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x240x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[120]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & 
d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %127, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119}," + "{%120, %121, %122, %123}," + " %124," + " %125, %126," + " p, %128, %129;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), 
"+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F16E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %66, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + " %62," + " %63," + " %64, %65," + " p, %67, %68;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + 
"+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F16+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F16E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[62]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %69, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61}," + "{%62, %63, %64, %65}," + " %66," + " %67, %68," + " p, %70, %71;\n" + "}\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), 
"+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F32E5M2E5M2_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %128, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, 
%33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + " %124," + " %125," + " %126, %127," + " p, %129, %130;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "l"(desc_a), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SPARSE GMMA 64x248x64 TN F32+=E5M2*E5M2 +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One, + GMMA::SparseSel spsel = GMMA::SparseSel::Zero +> +struct GMMA_64x248x64_F32E5M2E5M2_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using ERegisters = uint32_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[124]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & 
d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + uint32_t const& e, + GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One) + { +#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) + cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b); + asm volatile( + "{\n" + ".reg .pred p;\n" + "setp.ne.b32 p, %131, 0;\n" + "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e5m2 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123}," + "{%124, %125, %126, %127}," + " %128," + " %129, %130," + " p, %132, %133;\n" + "}\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), 
"+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "r"(e), "n"(int32_t(spsel)), + "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace SM90::GMMA::SPARSE + +} // namespace cute diff --git a/include/cute/arch/util.hpp b/include/cute/arch/util.hpp new file mode 100644 index 0000000000..3749a9c255 --- /dev/null +++ b/include/cute/arch/util.hpp @@ -0,0 +1,320 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include + +#if defined(__clang__) && defined(__CUDA__) + // __cvta_generic_to_shared was added in Clang 14: https://reviews.llvm.org/D111665 + #if __clang_major__ >= 14 + #define CUTE_CLANG_SUPPORTS_CVTA_GENERIC_TO_SHARED 1 + #endif + + // __nvvm_get_smem_pointer added in Clang 14: https://reviews.llvm.org/D111665 + // ... 
but will not work on Windows until Clang 15: https://reviews.llvm.org/D122897 + #if (!defined(_WIN32) && __clang_major__ >= 14) || __clang_major__ >= 15 + #define CUTE_CLANG_SUPPORTS_NVVM_GET_SMEM_POINTER 1 + #endif +#endif + +#if defined(__NVCC__) || defined(__CUDACC_RTC__) + // __cvta_generic_to_shared added in CUDA 11+ + #if __CUDACC_VER_MAJOR__ >= 11 + #define CUTE_NVCC_SUPPORTS_CVTA_GENERIC_TO_SHARED 1 + #endif + + // __nvvm_get_smem_pointer added in CUDA 10.2 + #if __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2 + #define CUTE_NVCC_SUPPORTS_NVVM_GET_SMEM_POINTER 1 + #endif +#endif + +#if CUTE_NVCC_SUPPORTS_CVTA_GENERIC_TO_SHARED || CUTE_CLANG_SUPPORTS_CVTA_GENERIC_TO_SHARED + #define CUTE_CVTA_GENERIC_TO_SHARED_SUPPORTED 1 +#endif + +#if !defined(CUTE_CVTA_GENERIC_TO_SHARED_ACTIVATED) && CUTE_CVTA_GENERIC_TO_SHARED_SUPPORTED && defined(__CUDA_ARCH__) + #define CUTE_CVTA_GENERIC_TO_SHARED_ACTIVATED 1 +#endif + +#if CUTE_NVCC_SUPPORTS_NVVM_GET_SMEM_POINTER || CUTE_CLANG_SUPPORTS_NVVM_GET_SMEM_POINTER + #define CUTE_NVVM_GET_SMEM_POINTER_SUPPORTED 1 +#endif + +#if !defined(CUTE_NVVM_GET_SMEM_POINTER_ACTIVATED) && CUTE_NVVM_GET_SMEM_POINTER_SUPPORTED && defined(__CUDA_ARCH__) + #define CUTE_NVVM_GET_SMEM_POINTER_ACTIVATED 1 +#endif + +// Clang 14+ provides a declaration of __nvvm_get_smem_pointer, so we only need +// to provide one for NVCC +#if CUTE_NVCC_SUPPORTS_NVVM_GET_SMEM_POINTER + extern "C" { + // This NVVM intrinsic is subject to change in future versions of CUDA. + // Clients should not call it directly. + CUTE_DEVICE uint32_t __nvvm_get_smem_pointer(void*); + } +#endif + +namespace cute +{ + +/// CUTE helper to cast SMEM pointer to unsigned +CUTE_DEVICE +uint32_t +cast_smem_ptr_to_uint(void const* const ptr) +{ +// We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to +// the previous internal intrinsics if they are available. +#if CUTE_CVTA_GENERIC_TO_SHARED_ACTIVATED + // + // This NVVM intrinsic converts an address in shared memory to a plain + // unsigned integer. This is necessary to pass to shared memory instructions + // in inline PTX. + // + // In CUDA 11 and beyond, this replaces __nvvm_get_smem_pointer() [only available in 10.2]. + // + //__device__ size_t __cvta_generic_to_shared(void* ptr); + + /// CUTE helper to get SMEM pointer + return static_cast(__cvta_generic_to_shared(ptr)); + +#elif CUTE_NVVM_GET_SMEM_POINTER_ACTIVATED + + return __nvvm_get_smem_pointer(ptr); + +#elif defined(__CUDA_ARCH__) + + uint32_t smem_ptr; + + asm( + "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n" + : "=r"(smem_ptr) : "l"(ptr)); + + return smem_ptr; + +#else + + + (void) ptr; + printf("ERROR: cast_smem_ptr_to_uint not supported but used.\n"); + return 0; + +#endif +} + +namespace detail { + +// +// Wrapper for MMAOp::fma +// + +template +struct CallFMA { + template + CUTE_HOST_DEVICE constexpr void + operator()(Args&&... args) const { + return MmaOp::fma(static_cast(args)...); + } +}; + +// +// Wrapper for CopyOp::copy +// + +template +struct CallCOPY { + template + CUTE_HOST_DEVICE constexpr void + operator()(Args&&... 
args) const { + return CopyOp::copy(static_cast(args)...); + } +}; + +// +// Utility for exploding pointers/arrays/tensors into functions +// + +template +CUTE_HOST_DEVICE constexpr +void +explode(Fn fn, + PtrA&& a, int_sequence) +{ + return fn(a[I]...); +} + +template +CUTE_HOST_DEVICE constexpr +void +explode(Fn fn, + PtrS&& s, int_sequence, + PtrD&& d, int_sequence) +{ + return fn(s[Is]..., d[Id]...); +} + +template +CUTE_HOST_DEVICE constexpr +void +explode(Fn fn, + PtrA&& a, int_sequence, + PtrB&& b, int_sequence, + PtrC&& c, int_sequence) +{ + return fn(a[Ia]..., b[Ib]..., c[Ic]...); +} + +template +CUTE_HOST_DEVICE constexpr +void +explode(Fn fn, + PtrD&& d, int_sequence, + PtrA&& a, int_sequence, + PtrB&& b, int_sequence, + PtrC&& c, int_sequence) +{ + return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]...); +} + +template +CUTE_HOST_DEVICE constexpr +void +explode(Fn fn, + PtrD&& d, int_sequence, + PtrA&& a, int_sequence, + PtrB&& b, int_sequence, + PtrC&& c, int_sequence, + PtrE&& e, int_sequence) +{ + return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]...); +} + +template +CUTE_HOST_DEVICE constexpr +void +explode(Fn fn, + PtrD&& d, int_sequence, + PtrA&& a, int_sequence, + PtrB&& b, int_sequence, + PtrC&& c, int_sequence, + PtrE&& e, int_sequence, + PtrF&& f, int_sequence) +{ + return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]..., f[If]...); +} + +template +CUTE_HOST_DEVICE constexpr +void +explode(Fn fn, + PtrD&& d, int_sequence, + PtrA&& a, int_sequence, + PtrB&& b, int_sequence, + PtrC&& c, int_sequence, + PtrE&& e, int_sequence, + PtrF&& f, int_sequence, + PtrG&& g, int_sequence) +{ + return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]..., f[If]..., g[Ig]...); +} + +// +// Utility for exploding tuples into functions +// + +template +CUTE_HOST_DEVICE constexpr +void +explode_tuple(Fn fn, + TupleA&& a, int_sequence) +{ + return fn(get(a)...); +} + +template +CUTE_HOST_DEVICE constexpr +void +explode_tuple(Fn fn, + TupleA&& a, int_sequence, + TupleB&& b, int_sequence) +{ + return fn(get(a)..., get(b)...); +} + +template +CUTE_HOST_DEVICE constexpr +void +explode_tuple(Fn fn, + TupleA&& a, int_sequence, + TupleB&& b, int_sequence, + TupleC&& c, int_sequence) +{ + return fn(get(a)..., get(b)..., get(c)...); +} + +} // end namespace detail + +} // end namespace cute diff --git a/include/cute/atom/copy_atom.hpp b/include/cute/atom/copy_atom.hpp new file mode 100644 index 0000000000..75b7aa4de6 --- /dev/null +++ b/include/cute/atom/copy_atom.hpp @@ -0,0 +1,764 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::Tensor +#include // cute::__CUTE_REQUIRES +#include // cute::is_tuple +#include // cute::is_constant, cute::is_integral +#include // cute::Copy_Traits +#include // cute::TiledMMA + +namespace cute +{ + +template +struct Copy_Atom; + +template +struct Copy_Atom : Copy_Atom, CopyInternalType> +{}; + +template +struct Copy_Atom, CopyInternalType> + : Copy_Traits +{ + using Traits = Copy_Traits; + + // Bit and Thr layouts from the Copy_Traits + using ThrID = typename Traits::ThrID; + using BitLayoutSrc = typename Traits::SrcLayout; + using BitLayoutDst = typename Traits::DstLayout; + using BitLayoutRef = typename Traits::RefLayout; + + using ValType = CopyInternalType; + + using ValLayoutSrc = decltype(recast_layout(BitLayoutSrc{})); + using ValLayoutDst = decltype(recast_layout(BitLayoutDst{})); + using ValLayoutRef = decltype(recast_layout(BitLayoutRef{})); + + CUTE_STATIC_ASSERT_V(size<0>(ValLayoutSrc{}) == size(ThrID{}), "CopyOperation is not valid for Src of ValType."); + CUTE_STATIC_ASSERT_V(size<0>(ValLayoutDst{}) == size(ThrID{}), "CopyOperation is not valid for Dst of ValType."); + CUTE_STATIC_ASSERT_V(size<0>(ValLayoutRef{}) == size(ThrID{}), "CopyOperation is not valid for Ref of ValType."); + + static constexpr int NumValSrc = size<1>(ValLayoutSrc{}); + static constexpr int NumValDst = size<1>(ValLayoutDst{}); + + // Additional Trait parameters/transformations + template + CUTE_HOST_DEVICE + auto + with(TraitsArgs&&... args) const { + auto traits = Traits::with(static_cast(args)...); + return Copy_Atom{traits}; + } + + // + // Tensor call interfaces + // + + // Check and call instruction, or recurse + template + CUTE_HOST_DEVICE + void + call(Tensor const& src, + Tensor & dst) const + { + static_assert(SLayout::rank == 1, "Expected rank-1 src tensor"); + static_assert(DLayout::rank == 1, "Expected rank-1 dst tensor"); + + if constexpr (is_constant::value || + is_constant::value) { + // Dispatch to unpack to execute instruction + return copy_unpack(static_cast(*this), src, dst); + } else if constexpr (is_tuple::value && + is_tuple::value) { + // If the size of the src/dst doesn't match the instruction, + // recurse this rank-1 layout by peeling off the mode + // ((A,B,C,...)) -> (A,B,C,...) 
+ return copy(*this, tensor<0>(src), tensor<0>(dst)); + } else { + static_assert(dependent_false, + "CopyAtom: Src/Dst partitioning does not match the instruction requirement."); + } + } + + // Accept mutable temporaries + template + CUTE_HOST_DEVICE + void + call(Tensor const& src, + Tensor && dst) const + { + return call(src, dst); + } +}; + +// +// A tiling of copy atoms +// + +template +struct ThrCopy; + +template coord [Need not be 2D...] + class ShapeTiler_MN> // coord space +struct TiledCopy : Copy_Atom +{ + // Layout information from the CopyAtom + using AtomThrID = typename Copy_Atom::ThrID; // thrid -> thr_idx + using AtomLayoutSrc = typename Copy_Atom::ValLayoutSrc; // (thr,val) -> offset + using AtomLayoutDst = typename Copy_Atom::ValLayoutDst; // (thr,val) -> offset + using AtomLayoutRef = typename Copy_Atom::ValLayoutRef; // (thr,val) -> offset + + using AtomNumThr = decltype(size<0>(AtomLayoutRef{})); + using AtomNumVal = decltype(size<1>(AtomLayoutRef{})); + + // Layout information for the TiledCopy + using Tiler_MN = ShapeTiler_MN; + using TiledLayout_TV = LayoutCopy_TV; + using TiledNumThr = decltype(size<0>(TiledLayout_TV{})); + using TiledNumVal = decltype(size<1>(TiledLayout_TV{})); + + CUTE_STATIC_ASSERT_V(TiledNumThr{} % AtomNumThr{} == Int<0>{}, "TiledCopy uses too few thrs for selected CopyAtom"); + CUTE_STATIC_ASSERT_V(TiledNumVal{} % AtomNumVal{} == Int<0>{}, "TiledCopy uses too few vals for selected CopyAtom"); + + // Tile a tensor or a layout from shape + // (M,N,...) + // to shape + // ((ThrV,ThrX),FrgV,(RestM,RestN,...)) + // where + // ThrV: The threads local to a COPY_ATOM Src. + // ThrX: The threads tiled across COPY_ATOMs Src. + // FrgV: The values local to a COPY_ATOM Src. + // RestM: The values tiled in M. + // RestN: The values tiled in N. + template + CUTE_HOST_DEVICE constexpr static + auto + tidfrg_S(STensor&& stensor) + { + CUTE_STATIC_ASSERT_V(rank(stensor) >= rank(Tiler_MN{}), "Rank of tensor to be partitioned too small."); + + // Tile the stensor and compute the (src-thr, src-val) -> (ref-thr, ref-val) layout + return tile2thrfrg(zipped_divide(stensor,Tiler_MN{}), right_inverse(AtomLayoutRef{}).compose(AtomLayoutSrc{})); + } + + // Tile a tensor or a layout from shape + // (M,N,...) + // to shape + // ((ThrV,ThrX),FrgV,(RestM,RestN,...)) + // where + // ThrV: The threads local to a COPY_ATOM Dst. + // ThrX: The threads tiled across COPY_ATOMs Dst. + // FrgV: The values local to a COPY_ATOM Dst. + // RestM: The values tiled in M. + // RestN: The values tiled in N. 
+ template + CUTE_HOST_DEVICE constexpr static + auto + tidfrg_D(DTensor&& dtensor) + { + CUTE_STATIC_ASSERT_V(rank(dtensor) >= rank(Tiler_MN{}), "Rank of tensor to be partitioned too small."); + + // Tile the dtensor and compute the (dst-thr, dst-val) -> (ref-thr, ref-val) layout + return tile2thrfrg(zipped_divide(dtensor,Tiler_MN{}), right_inverse(AtomLayoutRef{}).compose(AtomLayoutDst{})); + } + + // Tile a tensor or a layout from shape + // ((TileM,TileN,...), (RestM,RestN,...)) + // to shape + // ((ThrV,ThrX),FrgV,(RestM,RestN,...)) + template + CUTE_HOST_DEVICE constexpr static + auto + tile2thrfrg(Tensor&& tensor, Ref2TrgLayout const& ref2trg) + { + // Take the thrs/vals that the atom is interested in + // NOTE: Assumes the AtomNumThr are contiguous and identity within TiledThrID + auto atom_layout_TV = zipped_divide(TiledLayout_TV{}, make_shape(AtomNumThr{}, AtomNumVal{})); + // ((atom_tid,atom_val),(rest_tid,rest_val)) -> (m,n) + + // Transform to the trg layout + auto trg_layout_TV = atom_layout_TV.compose(ref2trg, _); + // ((trg_tid,trg_val),(rest_tid,rest_val)) -> (m,n) + + // Transform the thrs mode from thrid to thr_idx + // NOTE: Assumes the AtomNumThr are contiguous and identity within TiledThrID + auto thrval2mn = coalesce(zip(trg_layout_TV), Shape<_1,Shape<_1,_1>>{}); + // ((trg_tid,rest_tid),(trg_val,rest_val)) -> (m,n) + + /// ================== + + // Transform the tile mode + auto tv_tensor = tensor.compose(thrval2mn, _); + // ((thrid,val),(RestM,RestN,...)) + + // Unfold and return + return tv_tensor(make_coord(_,_), _); + } + + // retile_S and retile_D assume they are working with the reference layout -- they are the same + template + CUTE_HOST_DEVICE constexpr static + auto + retile(Tensor&& tensor) + { + constexpr int R = remove_cvref_t::rank; + // Assert that AtomLayoutSrc|Dst is identity so we can skip the Ref transformation + + // Assume the first size<0>(tensor) elements are the first val_ids in TiledLayout_TV. 
+ // Then, we only need the shape+layout of those size<0>(tensor) elements in TiledLayout_TV + // and that shape is what we gather from the other modes of tensor + + auto V = size<0>(tensor); + + auto frg_layout_mn = upcast(right_inverse(TiledLayout_TV{}).with_shape(shape(Tiler_MN{}))); + // (m,n) -> v_idx -- The shape and order of the V inside of TiledLayout_TV + + auto frg_layout_v = zipped_divide(logical_product(make_layout(V), right_inverse(frg_layout_mn)), make_layout(AtomNumVal{})); + // (atom_vals,rest_vals) -> (v,m,n) + + /// ======= + + // Tile the tensor for TileFrg + auto t_tensor = zipped_divide(tensor, prepend(product_each(shape(frg_layout_mn)), V)); + // ((TileV,TileM,TileN,...),(1,RestM,RestN,...)) + + // Transform the tile mode + auto v_tensor = t_tensor.compose(frg_layout_v, _); + // ((atom_vals,rest_vals),(1,RM,RN,...)) + + // Unfold and return + return v_tensor(_, append(Int<0>{},_)); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutS_TV() + { + // (M,N) -> (M,N) + auto ref_S = make_layout(make_shape(shape(Tiler_MN{}), Int<1>{})); + // (thr_idx,val_idx) -> (M,N) + return tile2thrfrg(ref_S, right_inverse(AtomLayoutRef{}).compose(AtomLayoutSrc{}))(_,_,Int<0>{}); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutS_MN() + { + // (thr_idx,val_idx) -> (M,N) + auto layoutS_TV = get_layoutS_TV(); + // (M,K) -> (thr_idx,val_idx) + auto layoutS_MK = right_inverse(layoutS_TV).with_shape(shape(Tiler_MN{})); + + // athrid = (v,m,k) -> thr_idx + auto thrID_S = make_layout(size<0>(TiledLayout_TV{})); + + return cute::make_tuple(layoutS_MK, thrID_S); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutD_TV() + { + // (M,N) -> (M,N) + auto ref_D = make_layout(make_shape(shape(Tiler_MN{}), Int<1>{})); + // (thr_idx,val_idx) -> (M,N) + return tile2thrfrg(ref_D, right_inverse(AtomLayoutRef{}).compose(AtomLayoutDst{}))(_,_,Int<0>{}); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutD_MN() + { + // (thr_idx,val_idx) -> (M,N) + auto layoutD_TV = get_layoutD_TV(); + // (M,K) -> (thr_idx,val_idx) + auto layoutD_MK = right_inverse(layoutD_TV).with_shape(shape(Tiler_MN{})); + + // athrid = (v,m,k) -> thr_idx + auto thrID_D = make_layout(size<0>(TiledLayout_TV{})); + + return cute::make_tuple(layoutD_MK, thrID_D); + } + + template ::value)> + CUTE_HOST_DEVICE static + auto + get_slice(ThrIdx const& thr_idx) + { + return ThrCopy(thr_idx); + } + + template ::value)> + CUTE_HOST_DEVICE static + auto + get_thread_slice(ThrIdx const& thr_idx) + { + return get_slice(thr_idx); + } +}; + +template +struct ThrCopy +{ + ThrIdx thr_idx_; + + CUTE_HOST_DEVICE + ThrCopy(ThrIdx const& thr_idx) : thr_idx_(thr_idx) {} + + template + CUTE_HOST_DEVICE + auto + partition_S(STensor&& stensor) const { + //static_assert(sizeof(typename remove_cvref_t::value_type) == sizeof(typename TiledCopy::ValType), + // "Expected ValType for tiling SrcTensor."); + auto thr_tensor = make_tensor(static_cast(stensor).data(), TiledCopy::tidfrg_S(stensor.layout())); + return thr_tensor(thr_idx_, _, repeat>(_)); + } + + template + CUTE_HOST_DEVICE + auto + partition_D(DTensor&& dtensor) const { + //static_assert(sizeof(typename remove_cvref_t::value_type) == sizeof(typename TiledCopy::ValType), + // "Expected ValType for tiling DstTensor."); + auto thr_tensor = make_tensor(static_cast(dtensor).data(), TiledCopy::tidfrg_D(dtensor.layout())); + return thr_tensor(thr_idx_, _, repeat>(_)); + } + + template + CUTE_HOST_DEVICE static + auto + retile_S(STensor&& stensor) { + // 
static_assert(sizeof(typename remove_cvref_t::value_type) == sizeof(typename TiledCopy::ValType), + // "Expected ValType for tiling SrcTensor."); + return make_tensor(static_cast(stensor).data(), TiledCopy::retile(stensor.layout())); + } + + template + CUTE_HOST_DEVICE static + auto + retile_D(DTensor&& dtensor) { + // static_assert(sizeof(typename remove_cvref_t::value_type) == sizeof(typename TiledCopy::ValType), + // "Expected ValType for tiling DstTensor."); + return make_tensor(static_cast(dtensor).data(), TiledCopy::retile(dtensor.layout())); + } +}; + + +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_impl(Copy_Atom const& atom, + LayoutCopy_TV const&, + Tiler const&) +{ + return TiledCopy, LayoutCopy_TV, Tiler>{atom}; +} + +// +// These tile the Copy_Atom as a whole +// + +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_A(Copy_Atom const& copy_atom, + TiledMMA const& mma) +{ + return make_tiled_copy_impl(copy_atom, mma.get_layoutA_TV(), make_shape(tile_size<0>(mma),tile_size<2>(mma))); +} + +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_B(Copy_Atom const& copy_atom, + TiledMMA const& mma) +{ + return make_tiled_copy_impl(copy_atom, mma.get_layoutB_TV(), make_shape(tile_size<1>(mma),tile_size<2>(mma))); +} + +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_C(Copy_Atom const& copy_atom, + TiledMMA const& mma) +{ + return make_tiled_copy_impl(copy_atom, mma.get_layoutC_TV(), make_shape(tile_size<0>(mma),tile_size<1>(mma))); +} + +// returns the smallest tiled copy that can retile LayoutC_TV +// for use with pipelined epilogues with subtiled stores +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_C_atom(Copy_Atom const& copy_atom, + TiledMMA const& mma) +{ + // Truncate the V-layout to just the Copy_Atom, keep the V-order + auto layoutC_TV = mma.get_layoutC_TV(); + auto copy_V = Int::NumValSrc>{}; + CUTE_STATIC_ASSERT_V(copy_V <= size<1>(layoutC_TV)); + auto layout_TV = composition(layoutC_TV, make_layout(make_shape(size<0>(layoutC_TV), copy_V))); + + // Recompute tiler and restride the TV layout for the new tiler + + // Tiler -- Find the active elements in the MMA tensor and generate a tiler to extract them + // Convert to the awkward by-mode tiler to preserve the modes of the tiled MMA + auto mma_tiler = make_shape(tile_size<0>(mma),tile_size<1>(mma)); + auto mma_zeros = repeat_like(mma_tiler, Int<0>{}); + + auto tiler = transform(make_seq{}, [&](auto i) { + return filter(composition(make_layout(mma_tiler, replace(mma_zeros, Int<1>{})), layout_TV)); + }); + + // Layout_TV -- Find the (tid,vid) -> tile coord transformation + // Apply the tiler to a reference and transform the codomain + // tile_coord -> mma_coord + auto tile2mma = composition(make_layout(mma_tiler), tiler); + + // (tid,vid) -> tile_coord + auto layout_tv = composition(left_inverse(tile2mma), layout_TV); + + return make_tiled_copy_impl(copy_atom, layout_tv, tiler); +} + +/** Produce a TiledCopy from logical thread and values layouts. + * The thread and value layouts map coordinates to thr_idx and val_idx. + * The product of these layouts is taken to produce the TV layout and the Tiler. + * Useful when threads and values need very specific mappings onto coordinates + * in the target tensors. 
+ */ +template > +CUTE_HOST_DEVICE +auto +make_tiled_copy(Copy_Atom const& copy_atom, + ThrLayout const& thr_layout = {}, // (m,n) -> thr_idx + ValLayout const& val_layout = {}) // (m,n) -> val_idx +{ + // Take the raked_products to compute the Layout_MN + // (M,N) -> (thr_idx, val_idx) + auto layout_mn = raked_product(thr_layout, val_layout); + // (thr_idx, val_idx) -> (M,N) + auto layout_tv = right_inverse(layout_mn).with_shape(make_shape(size(thr_layout), size(val_layout))); + // Tiler for extracting relevant elements + // (M,N) -> tensor coord + auto tiler = product_each(shape(layout_mn)); + +#if 0 + print("thr_layout: "); print(thr_layout); print("\n"); + print("val_layout: "); print(val_layout); print("\n"); + print("layout_mn : "); print(layout_mn); print("\n"); + print("layout_tv : "); print(layout_tv); print("\n"); + print("tiler : "); print(tiler); print("\n"); +#endif + + return make_tiled_copy_impl(copy_atom, layout_tv, tiler); +} + +/** Produce a TiledCopy from thread and value offset maps. + * The TV Layout maps threads and values to the codomain of the data_layout. + * It is verified that the intended codomain is valid within data_layout. + * Useful when threads and values don't care about owning specific coordinates, but + * care more about the vector-width and offsets between them. + */ +template +CUTE_HOST_DEVICE constexpr +auto +make_cotiled_copy(Copy_Atom const& copy_atom, + AtomTVLayout const& atom_tv_layout, // atom (thr,val) -> data addr + DataLayout const& data_layout) // coord -> data addr The target layout +{ + static_assert(is_static::value); + static_assert(is_static::value); + + // data addr -> data coord Append 1:0 so off-the-ends get the stride-0 + auto inv_data_layout = make_layout(left_inverse(data_layout), Layout<_1,_0>{}); + + // (tid,vid) -> data_coord + auto layout_tv_data = composition(inv_data_layout, atom_tv_layout); + + // Check validity + CUTE_STATIC_ASSERT_V(coalesce(composition(data_layout, layout<1>(layout_tv_data))) == coalesce(layout<1>(atom_tv_layout)), + "The memory pointed to by AtomTVLayout does not exist in the DataLayout."); + +#if 0 + if (thread0()) { + print("data_layout : "); print(data_layout); print("\n"); + print("atom_tv_layout : "); print(atom_tv_layout); print("\n"); + print("layout_tv_data : "); print(layout_tv_data); print("\n"); + } +#endif + + // + // Tiler -- Find the active elements in the DATA tensor and generate a tiler to extract them + // + + // Convert to the awkward by-mode tiler to preserve the modes of the tiled DATA + auto flat_data_shape = product_each(shape(data_layout)); + auto flat_data_zeros = repeat(Int<0>{}); + + auto tiler = transform(make_seq{}, [&](auto i) { + return filter(composition(make_layout(flat_data_shape, replace(flat_data_zeros, Int<1>{})), layout_tv_data)); + }); + + // + // Layout_TV -- Find the (tid,vid) -> tile coord transformation + // + + // Apply the tiler to a reference and transform the codomain + // tile_coord -> data_coord + auto tile2data = composition(make_layout(flat_data_shape), tiler); + + // (tid,vid) -> tile_coord + auto layout_tv = composition(left_inverse(tile2data), layout_tv_data); + +#if 0 + if (thread0()) { + print("tiler : "); print(tiler); print("\n"); + print("tile2data : "); print(tile2data); print("\n"); + print("layout_tv : "); print(layout_tv); print("\n"); + } +#endif + + return make_tiled_copy_impl(copy_atom, layout_tv, tiler); +} + +// Make a TiledCopy out of the copy_atom that matches the Src-Layout of tiled_copy +template +CUTE_HOST_DEVICE +auto 
+make_tiled_copy_S(Copy_Atom const& copy_atom, + TiledCopy const& tiled_copy) +{ + return make_tiled_copy_impl(copy_atom, tiled_copy.get_layoutS_TV(), typename TiledCopy::Tiler_MN{}); +} + +// Make a TiledCopy out of the copy_atom that matches the Dst-Layout of tiled_copy +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_D(Copy_Atom const& copy_atom, + TiledCopy const& tiled_copy) +{ + return make_tiled_copy_impl(copy_atom, tiled_copy.get_layoutD_TV(), typename TiledCopy::Tiler_MN{}); +} + +// +// Size +// + +// The logical size of a TileCopy +template +CUTE_HOST_DEVICE constexpr +auto +tile_size(TiledCopy const&) +{ + return size(typename TiledCopy::Tiler_MN{}); +} + +// The number of threads involved in a TiledCopy +template +CUTE_HOST_DEVICE constexpr +auto +size(TiledCopy const&) +{ + return typename TiledCopy::TiledNumThr{}; +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE +void +print(Copy_Atom, T> const&) +{ + using Atom = Copy_Atom, T>; + print("Copy_Atom\n"); + print(" ThrID: "); print(typename Atom::ThrID{}); print("\n"); + print(" ValLayoutSrc: "); print(typename Atom::ValLayoutSrc{}); print("\n"); + print(" ValLayoutDst: "); print(typename Atom::ValLayoutDst{}); print("\n"); + print(" ValLayoutRef: "); print(typename Atom::ValLayoutRef{}); print("\n"); + print(" ValueType: "); print(sizeof_bits::value); print("b\n"); +} + +template +CUTE_HOST_DEVICE +void +print(TiledCopy const& copy, char const* pad = "") +{ + using Copy = TiledCopy; + print("TiledCopy\n"); + print(" Tiler_MN: "); print(typename Copy::Tiler_MN{}); print("\n"); + print(" TiledLayout_TV: "); print(typename Copy::TiledLayout_TV{}); print("\n"); + print(static_cast(copy)); +} + +template +CUTE_HOST_DEVICE +void +print(ThrCopy const& thr_copy) +{ + print("ThrCopy\n"); + print(" ThrIdx: "); print(thr_copy.thr_idx_); print("\n"); + print(TiledCopy{}); +} + +// TiledCopy to LaTeX TikZ +template +CUTE_HOST_DEVICE +auto +print_latex(TiledCopy const& copy, + TikzColorFn color = {}) // lambda(thr_idx,val_idx) -> tikz color string +{ + auto [layoutS_MN, thrID_S] = copy.get_layoutS_MN(); + auto [layoutD_MN, thrID_D] = copy.get_layoutD_MN(); + + print_latex_copy(layoutS_MN, thrID_S, + layoutD_MN, thrID_D); +} + +// MNK Copy Layout to LaTeX TikZ +template +CUTE_HOST_DEVICE +void +print_latex_copy(LayoutS const& S, ThrIDS const& TS, // (m,n) -> (tid,vid) and tid -> thr_idx + LayoutD const& D, ThrIDD const& TD, // (m,n) -> (tid,vid) and tid -> thr_idx + TikzColorFn color = {}) // lambda(thr_idx,val_idx) -> tikz color string +{ + CUTE_STATIC_ASSERT_V(rank(S) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(D) == Int<2>{}); + + assert(size<0>(S) == size<0>(D)); + assert(size<1>(S) == size<1>(D)); + + // Commented prints + printf("%% LayoutS: "); print(S); printf("\n"); + printf("%% ThrIDS : "); print(TS); printf("\n"); + printf("%% LayoutD: "); print(D); printf("\n"); + printf("%% ThrIDD : "); print(TD); printf("\n\n"); + + // Header + printf("\\documentclass[convert]{standalone}\n" + "\\usepackage{tikz}\n\n" + "\\begin{document}\n" + "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n"); + + // S starting at 0,0 + for (int i = 0; i < size<0>(S); ++i) { + for (int j = 0; j < size<1>(S); ++j) { + int thrid = S(i,j) % size(TS); + int val_idx = S(i,j) / size(TS); + int thr_idx = TS(thrid); + + printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color(thr_idx, val_idx), + i, j, + thr_idx, val_idx); + } + } + // Grid + 
printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n", + 0, 0, int(size<0>(S)), int(size<1>(S))); + // S Labels + for (int i = 0, j = -1; i < size<0>(S); ++i) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, i); + } + for (int i = -1, j = 0; j < size<1>(S); ++j) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, j); + } + + // D starting at 0,size<1>(S)+3 + for (int i = 0; i < size<0>(D); ++i) { + for (int j = 0; j < size<1>(D); ++j) { + int thrid = D(i,j) % size(TD); + int val_idx = D(i,j) / size(TD); + int thr_idx = TD(thrid); + + printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color(thr_idx, val_idx), + i, j + size<1>(S) + 3, + thr_idx, val_idx); + } + } + // Grid + printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n", + 0, int(size<1>(S)+3), int(size<0>(D)), int(size<1>(D)+size<1>(S)+3)); + // D Labels + for (int i = 0, j = size<1>(D); i < size<0>(D); ++i) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j + size<1>(S) + 3, i); + } + for (int i = -1, j = 0; j < size<1>(D); ++j) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j + size<1>(S) + 3, j); + } + + // Footer + printf("\\end{tikzpicture}\n" + "\\end{document}\n"); +} + +} // end namespace cute + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include + +// Config +#if (__CUDACC_VER_MAJOR__ >= 12) +# define CUTE_COPY_ATOM_TMA_SM90_ENABLED +#endif + +#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED) +#include +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cute/atom/copy_traits.hpp b/include/cute/atom/copy_traits.hpp new file mode 100644 index 0000000000..ac746a64e1 --- /dev/null +++ b/include/cute/atom/copy_traits.hpp @@ -0,0 +1,162 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +namespace cute +{ + +/** + * concept Copy_Traits + * { + * using ThrID = // Logical thread id (tid) -> tidx + * + * using SrcLayout = // (Logical src thread id (tid), Logical src value id (vid)) -> bit + * using DstLayout = // (Logical dst thread id (tid), Logical dst value id (vid)) -> bit + * using RefLayout = // (Logical ref thread id (tid), Logical ref value id (vid)) -> bit + * }; + * + * The abstract bit ordering of the Copy_Traits (the codomain of SrcLayout, DstLayout, and RefLayout) + * is arbitrary and only used to construct maps + * (ref-tid,ref-vid) -> (src-tid,src-vid) + * (ref-tid,ref-vid) -> (dst-tid,dst-vid) + * in TiledCopy. The Layout_TV in TiledCopy is in accordance with the RefLayout of a Traits, then mapped to + * the Src or Dst (tid,vid) representation on demand. + * + */ + +template +struct Copy_Traits +{ + static_assert(dependent_false, "Copy_Traits not implemented for this CopyOperation."); +}; + +template +struct Copy_Traits> +{ + // Logical thread id to thread idx (one-thread) + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout::value>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout::value>>>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +template +struct Copy_Traits> +{ + // Logical thread id to thread idx (one-thread) + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, Stride<_0,_0>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout, Stride<_0,_0>>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +// Extract a CPY_Op from a CPY_Traits +template +struct CPY_Op {}; + +template +struct CPY_Op> { + using type = CPY_Op_Arg; +}; + +// +// Generic copy_unpack for common argument-based Copy_Traits +// + +template +CUTE_HOST_DEVICE constexpr +void +copy_unpack(AnyCPYTraits const&, + Tensor const& src, + Tensor & dst) +{ + using CopyOp = typename CPY_Op::type; + using RegistersSrc = typename CopyOp::SRegisters; + using RegistersDst = typename CopyOp::DRegisters; + using RegTypeSrc = typename remove_extent::type; + using RegTypeDst = typename remove_extent::type; + constexpr int RegNumSrc = extent::value; + constexpr int RegNumDst = extent::value; + + Tensor rS = recast(src); + Tensor rD = recast(dst); + + CUTE_STATIC_ASSERT_V(size(rS) == Int{}, + "Copy_Traits: src failed to vectorize into registers. Layout is incompatible with this CopyOp."); + CUTE_STATIC_ASSERT_V(size(rD) == Int{}, + "Copy_Traits: dst failed to vectorize into registers. 
Layout is incompatible with this CopyOp."); + + detail::explode(detail::CallCOPY{}, + rS, make_int_sequence{}, + rD, make_int_sequence{}); +} + +// Accept mutable temporaries +template +CUTE_HOST_DEVICE constexpr +void +copy_unpack(AnyCPYTraits const& traits, + Tensor const& src, + Tensor && dst) +{ + copy_unpack(traits, src, dst); +} + +namespace detail { + +template +constexpr bool is_prefetch = false; + +template +constexpr bool is_prefetch> = is_same_v; + +} // end namespace detail + + +} // end namespace cute diff --git a/include/cute/atom/copy_traits_sm50.hpp b/include/cute/atom/copy_traits_sm50.hpp new file mode 100644 index 0000000000..7a693805e6 --- /dev/null +++ b/include/cute/atom/copy_traits_sm50.hpp @@ -0,0 +1,75 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include + +#include + +namespace cute +{ + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (one-thread) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, + Stride<_64, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout,Shape <_32, _2>>, + Stride,Stride< _1, _64>>>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (one-thread) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, + Stride<_64, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout, Shape<_32, _2>>, + Stride,Stride< _1, _256>>>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +} // end namespace cute diff --git a/include/cute/atom/copy_traits_sm75.hpp b/include/cute/atom/copy_traits_sm75.hpp new file mode 100644 index 0000000000..9ad82c6174 --- /dev/null +++ b/include/cute/atom/copy_traits_sm75.hpp @@ -0,0 +1,143 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
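The warp-wide traits in copy_traits_sm50.hpp above (and the ldmatrix traits added below in copy_traits_sm75.hpp) all set ThrID = Layout<_32>, so they describe warp-cooperative copies; a TiledCopy built from such an atom is most naturally derived from a TiledMMA partitioning. A hedged sketch, with all tensor and variable names invented and only the op/helper names (SM75_U32x4_LDSM_N, make_tiled_copy_A, Copy_Atom) taken from CuTe:

// Sketch only: assumes a TiledMMA `tiled_mma`, a shared-memory tile `sA`,
// and half_t data; the variable names are illustrative, not from this patch.
auto smem_tiled_copy_A = make_tiled_copy_A(Copy_Atom<SM75_U32x4_LDSM_N, half_t>{}, tiled_mma);
auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(threadIdx.x);

auto thr_mma = tiled_mma.get_thread_slice(threadIdx.x);
auto tCrA    = thr_mma.partition_fragment_A(sA);      // register destination fragments
auto tCsA    = smem_thr_copy_A.partition_S(sA);       // smem source, tiled by the atom
auto tCrA_v  = smem_thr_copy_A.retile_D(tCrA);        // destination viewed in atom terms

copy(smem_tiled_copy_A, tCsA(_,_,_0{}), tCrA_v(_,_,_0{}));  // copy one K-block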
+ * + **************************************************************************************************/ +#pragma once + +#include +#include + +#include + +namespace cute +{ + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout,_128>, + Stride, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout, + Stride<_32, _1>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout,_128>, + Stride, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_32,Stride< _1,_1024>>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, + Stride<_128, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_32,Stride< _1,_1024>>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout,_128>, + Stride, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout,Shape <_16, _2>>, + Stride,Stride< _1,_128>>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout,_128>, + Stride, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout,Shape <_16, _2, _2>>, + Stride,Stride< _1,_128,_1024>>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, + Stride<_128, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout,Shape <_16, _2, _4>>, + Stride,Stride< _1,_128,_1024>>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +} // end namespace cute diff --git a/include/cute/atom/copy_traits_sm80.hpp b/include/cute/atom/copy_traits_sm80.hpp new file mode 100644 index 0000000000..3795f52a89 --- /dev/null +++ b/include/cute/atom/copy_traits_sm80.hpp @@ -0,0 +1,167 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include + +#include + +namespace cute +{ + +template +struct Copy_Traits> +{ + // Logical thread id to thread idx (one-thread) + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout::value>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout::value>>>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +template +struct Copy_Traits> +{ + // Logical thread id to thread idx (one-thread) + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout::value>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout::value>>>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +template +struct Copy_Traits> +{ + // Logical thread id to thread idx (one-thread) + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout::value>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout::value>>>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // Predicate value: true = load, false = zfill + bool pred = true; + + // Construct a zfill variant with a given predicate value + CUTE_HOST_DEVICE constexpr + Copy_Traits> + with(bool pred) const { + return {pred}; + } + + // Overload copy_unpack for zfill variant to pass the predicate into the op + template + CUTE_HOST_DEVICE friend constexpr + void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + static_assert(is_gmem::value, "Expected gmem source for cp.async."); + static_assert(is_smem
::value, "Expected smem destination for cp.async."); + + Tensor rS = recast(src); + Tensor rD = recast(dst); + + CUTE_STATIC_ASSERT_V(size(rS) == Int<1>{}, + "In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy."); + CUTE_STATIC_ASSERT_V(size(rD) == Int<1>{}, + "In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this tiled copy."); + + SM80_CP_ASYNC_CACHEALWAYS_ZFILL::copy(rS[0], rD[0], traits.pred); + } +}; + +template +struct Copy_Traits> +{ + // Logical thread id to thread idx (one-thread) + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout::value>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout::value>>>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // Predicate value: true = load, false = zfill + bool pred = true; + + // Construct a zfill variant with a given predicate value + CUTE_HOST_DEVICE constexpr + Copy_Traits> + with(bool pred) const { + return {pred}; + } + + // Overload copy_unpack for zfill variant to pass the predicate into the op + template + CUTE_HOST_DEVICE friend constexpr + void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + static_assert(is_gmem::value, "Expected gmem source for cp.async."); + static_assert(is_smem::value, "Expected smem destination for cp.async."); + + Tensor rS = recast(src); + Tensor rD = recast(dst); + + CUTE_STATIC_ASSERT_V(size(rS) == Int<1>{}, + "In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy."); + CUTE_STATIC_ASSERT_V(size(rD) == Int<1>{}, + "In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this tiled copy."); + + SM80_CP_ASYNC_CACHEGLOBAL_ZFILL::copy(rS[0], rD[0], traits.pred); + } +}; + +} // end namespace cute diff --git a/include/cute/atom/copy_traits_sm90.hpp b/include/cute/atom/copy_traits_sm90.hpp new file mode 100644 index 0000000000..f9590848af --- /dev/null +++ b/include/cute/atom/copy_traits_sm90.hpp @@ -0,0 +1,132 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include +#include + +#include + +namespace cute +{ + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = typename Copy_Traits::DstLayout; + // Map from (dst-thr,dst-val) to bit + using DstLayout = typename Copy_Traits::SrcLayout; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = typename Copy_Traits::DstLayout; + // Map from (dst-thr,dst-val) to bit + using DstLayout = typename Copy_Traits::SrcLayout; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = typename Copy_Traits::DstLayout; + // Map from (dst-thr,dst-val) to bit + using DstLayout = typename Copy_Traits::SrcLayout; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = typename Copy_Traits::DstLayout; + // Map from (dst-thr,dst-val) to bit + using DstLayout = typename Copy_Traits::SrcLayout; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = typename Copy_Traits::DstLayout; + // Map from (dst-thr,dst-val) to bit + using DstLayout = typename Copy_Traits::SrcLayout; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = typename Copy_Traits::DstLayout; + // Map from (dst-thr,dst-val) to bit + using DstLayout = typename Copy_Traits::SrcLayout; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; +}; + +} // end namespace cute diff --git a/include/cute/atom/copy_traits_sm90_im2col.hpp b/include/cute/atom/copy_traits_sm90_im2col.hpp new file mode 100644 index 0000000000..54f76073b1 --- /dev/null +++ b/include/cute/atom/copy_traits_sm90_im2col.hpp @@ -0,0 +1,940 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
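The SM90 stmatrix traits above mirror the ldmatrix traits with source and destination swapped, so an epilogue can reuse the MMA partitioning to store accumulators to shared memory warp-cooperatively. A hedged sketch, with variable names invented and only the op/helper names (SM90_U32x4_STSM_N, make_tiled_copy_C) taken from CuTe:

// Sketch only: assumes a TiledMMA `tiled_mma`, accumulators `tCrC` already
// converted to half_t, and a shared-memory tile `sC`; names are illustrative.
auto smem_tiled_copy_C = make_tiled_copy_C(Copy_Atom<SM90_U32x4_STSM_N, half_t>{}, tiled_mma);
auto smem_thr_copy_C   = smem_tiled_copy_C.get_thread_slice(threadIdx.x);

auto tCrC_v = smem_thr_copy_C.retile_S(tCrC);   // registers viewed in copy-atom terms
auto tCsC   = smem_thr_copy_C.partition_D(sC);  // smem destination partition

copy(smem_tiled_copy_C, tCrC_v, tCsC);          // warp-cooperative stmatrix store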
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +/*! \file + \brief im2col make_tma_copy +*/ + +#include "cute/arch/copy_sm90.hpp" +#include "cute/arch/copy_sm90_desc.hpp" +#include "cute/tensor.hpp" + +#include "cute/algorithm/prefetch.hpp" +#include "cutlass/fast_math.h" +#include "cutlass/cuda_host_adapter.hpp" + +namespace cute +{ + +// Utility for unpacking TMA_LOAD_IM2COL arguments into a CopyOp +template +struct TMA_LOAD_IM2COL_Unpack +{ + /// Copy from src to dst. + /// + /// @param traits Copy traits created with a TMA descriptor that + /// correctly matches the input tensor and other convolution + /// parameters. + /// + /// @param src Tile of the im2col-transformed coordinate tensor + /// (result of get_tma_tensor), representing the global-memory + /// tensor from which to load. + /// + /// @param dst Shared memory tile, into which to load. 
+ template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, // tile of the transformed global activation (A) tensor + Tensor & dst) // shared memory tile + { + auto src_coord_offset = src(Int<0>{}); + auto src_coord_cwhdn_offset_srt = flatten(src_coord_offset); + // Interpret the TMA IM2COL coordinate as (c, ([w,h,d]), n, ([s,r,t])) + CUTE_STATIC_ASSERT_V(rank(src_coord_offset) == _4{}); + CUTE_STATIC_ASSERT_V(rank<1>(src_coord_offset) == rank<3>(src_coord_offset)); + + if constexpr (detail::is_prefetch) { + return detail::explode_tuple(detail::CallCOPY{}, + traits.opargs_, tuple_seq{}, + src_coord_cwhdn_offset_srt, tuple_seq{}); + } else { + static_assert(is_smem::value, "SM90_TMA_LOAD_IM2COL requires the destination be shared memory."); + void* dst_ptr = cute::raw_pointer_cast(dst.data()); + return detail::explode_tuple(detail::CallCOPY{}, + traits.opargs_, tuple_seq{}, + make_tuple(dst_ptr), seq<0>{}, + src_coord_cwhdn_offset_srt, tuple_seq{}); + } + } +}; + +// Copy_Traits for SM90 im2col TMA load comes in two layers. +// +// 1. Copy_Traits +// 2. Copy_Traits +// +// Copy_Traits +// is the "outer" layer. It has a TMA descriptor, +// but no barrier ("tma_mbar"), so it's "nonexecutable." +// One calls its "with" member function with a barrier, +// to get an executable "inner"-layer +// Copy_Traits object. +// That object's "copy_unpack" member function +// actually invokes im2col TMA load. + +struct SM90_TMA_LOAD_IM2COL_OP : SM90_TMA_LOAD_IM2COL {}; + +/// @brief Non-executable specialization of Copy_Traits for SM90 +/// im2col TMA load, with TMA descriptor but no barrier. +/// +/// Use `.with(memory_barrier)` to construct an executable version. +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + Im2ColTmaDescriptor tma_desc_; + TMATensor tma_tensor_; + + CUTE_HOST_DEVICE constexpr + Im2ColTmaDescriptor const* + get_tma_descriptor() const + { + return &tma_desc_; + } + + template + CUTE_HOST_DEVICE constexpr + TMATensor const + get_tma_tensor(GShape const&) const + { + return tma_tensor_; + } + + /// @brief Get an executable specialization. + /// + /// Copy_Traits specializations with SM90_TMA_LOAD_IM2COL are not + /// directly executable. Instead, call this "with" member function + /// to get an executable specialization. "Executable" means that + /// @c copy_unpack works. + /// + /// @param tma_mbar Memory barrier for synchronization + /// + /// @param multicast_mask Multicast mask (unused; only exists + /// for interface compatibility with the actual multicast Copy_Traits) + /// + /// @return Executable specialization of @c Copy_Traits + CUTE_HOST_DEVICE constexpr + Copy_Traits + with(uint64_t& tma_mbar, [[maybe_unused]] uint16_t const& multicast_mask = 0) const + { + return {{}, {&tma_desc_, &tma_mbar}}; + } + + // Copy_Traits specializations with SM90_TMA_LOAD_IM2COL + // are not directly executable. Instead, call .with + // to get an executable specialization. + template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) = delete; +}; + +/// @brief Executable specialization of Copy_Traits for SM90 im2col +/// TMA load, with TMA descriptor and barrier. 
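Because only the executable layer defines a usable copy_unpack, kernels never name that specialization directly; they call .with(mbar) on the atom whenever a barrier is at hand. A hedged device-side sketch (the copy object, partitions, barrier, and byte count below are all invented names; copy, with, set_barrier_transaction_bytes, and wait_barrier are CuTe APIs):

// Sketch only: `tma_load_a` is the object returned by make_im2col_tma_copy
// (defined later in this file), `tAgA` a partition of its TMA coordinate
// tensor, `tAsA` the matching smem partition, `smem_mbar` an initialized
// shared-memory mbarrier, and kTmaTransactionBytes the expected byte count.
if (threadIdx.x == 0) {
  cute::set_barrier_transaction_bytes(smem_mbar, kTmaTransactionBytes);
  copy(tma_load_a.with(smem_mbar), tAgA, tAsA);   // builds the executable traits
}
cute::wait_barrier(smem_mbar, /*phase=*/0);       // all threads wait for completion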
+template +struct Copy_Traits + : TMA_LOAD_IM2COL_Unpack +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD_IM2COL arguments + tuple< + Im2ColTmaDescriptor const*, + uint64_t* // smem mbarrier + > const opargs_; +}; + +template +struct Copy_Traits + : TMA_LOAD_IM2COL_Unpack +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD_IM2COL::PREFETCH arguments + tuple const opargs_; + + CUTE_HOST_DEVICE + Copy_Traits(Copy_Traits const& traits) + : opargs_({&traits.tma_desc_}) {} +}; + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////// TMA_LOAD_MULTICAST ///////////////////////////// +////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_LOAD_IM2COL_MULTICAST_OP : SM90_TMA_LOAD_IM2COL_MULTICAST {}; + +/// @brief Non-executable specialization of Copy_Traits for SM90 +/// im2col TMA load, with TMA descriptor but no barrier or multicast +/// mask. +/// +/// Use `.with(memory_barrier)` to construct an executable version. +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + Im2ColTmaDescriptor tma_desc_; + TMATensor tma_tensor_; + + CUTE_HOST_DEVICE constexpr + Im2ColTmaDescriptor const* + get_tma_descriptor() const { + return &tma_desc_; + } + + template + CUTE_HOST_DEVICE constexpr + TMATensor const + get_tma_tensor(GShape const&) const + { + return tma_tensor_; + } + + /// @brief Get an executable specialization. + /// + /// Copy_Traits specializations with SM90_TMA_LOAD_IM2COL_MULTICAST + /// are not directly executable. Instead, call this "with" member + /// function to get an executable specialization. "Executable" + /// means that @c copy_unpack works. + /// + /// @param tma_mbar Memory barrier for synchronization + /// + /// @param multicast_mask Multicast mask (defaults to a single CTA) + /// + /// @return Executable specialization of @c Copy_Traits + CUTE_HOST_DEVICE constexpr + Copy_Traits + with(uint64_t& tma_mbar, uint16_t const& multicast_mask) const { + return {{}, {&tma_desc_, &tma_mbar, multicast_mask}}; + } + + // Copy_Traits specializations with SM90_TMA_LOAD_IM2COL_MULTICAST + // are not directly executable. Instead, call .with to get an + // executable specialization. + template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) = delete; +}; + +/// @brief Executable specialization of Copy_Traits for SM90 multicast +/// im2col TMA load, with TMA descriptor, barrier, and multicast mask. +template +struct Copy_Traits + : TMA_LOAD_IM2COL_Unpack +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit. 
+ using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD_IM2COL_MULTICAST arguments + tuple< + Im2ColTmaDescriptor const*, + uint64_t*, // smem mbarrier + uint16_t // multicast mask + > const opargs_; +}; + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////// TMA_STORE IM2COL//////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + +// The executable SM90_TMA_STORE_IM2COL with tma_desc +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_STORE_IM2COL arguments + Im2ColTmaDescriptor tma_desc_; + TMATensor tma_tensor_; + + // Return TmaDescriptor/TensorMap + CUTE_HOST_DEVICE constexpr + Im2ColTmaDescriptor const* + get_tma_descriptor() const { + return &tma_desc_; + } + + template + CUTE_HOST_DEVICE constexpr + TMATensor const + get_tma_tensor(GShape const&) const + { + return tma_tensor_; + } + + // This is the copy_unpack dispatch for this Copy_Traits + // Src needs to be a smem tensor + // Dst needs to be a gmem tensor with TmaCoordIterator .data() + template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + static_assert(is_smem::value, "Expected smem src for SM90_TMA_STORE_IM2COL"); + + void const* const desc_ptr = &(traits.tma_desc_); + void const* const src_ptr = cute::raw_pointer_cast(src.data()); + auto dst_coord = flatten(take<0,3>(dst(Int<0>{}))); + + return detail::explode_tuple(detail::CallCOPY{}, + make_tuple(desc_ptr, src_ptr), seq<0,1>{}, + dst_coord, tuple_seq{}); + } +}; + +namespace detail { + +/// @brief Creates a TMA descriptor for im2col TMA load. +/// +/// @param tensor_cwhdn Global activation tensor (A matrix of Fprop). +/// This is the original (not im2col-transformed) tensor in global +/// memory. +/// +/// @param slayout Rank 2 (M,K) shared memory layout of the activation +/// tensor. Here, K is "GEMM K," not the filter tensor's mode of +/// the same name. +////// +/// @param traversal_stride Traversal strides convolution parameter +////// +/// Each of padding_shape, traversal_stride, and dilation_shape is a +/// tuple whose size is the number of spatial modes (e.g., 3 for a 5-D +/// convolution). 
+/// +/// @return TMA descriptor for im2col TMA load +template +CUTE_HOST +auto +make_im2col_tma_copy_desc( + Tensor const& tensor_cwhdn, // (C,W,H,D,N) + uint32_t range_c, // TILE_C + uint32_t range_whdn, // TILE_WHDN + SmemSwizzle const& smem_swizzle, // Swizzle + TMALayout const& tma_layout_vt, // TMA layout + LowerCornerStride const& lower_corner_whd, // WHD offset of the "base pointer" + UpperCornerStride const& upper_corner_whd, // WHD upper corner + LowerPaddingStride const& lower_padding_whd, // WHD lower padding + UpperPaddingStride const& upper_padding_whd, // WHD upper padding + TraversalStride const& stride_whd, // WHD traversal stride + LowerSRTStride const& lower_srt, // SRT offset of the "base pointer" + DilationStride const& stride_srt, // SRT stride - dilation + TMA::DescriptorAuxParams const& aux_params = {}) +{ + static_assert(is_gmem::value, "Tensor must point to GPU global memory."); + using value_type = typename EngineA::value_type; + + constexpr uint32_t num_total_modes = LayoutA::rank; + constexpr int num_spatial_modes = num_total_modes - 2; + + // Gmem starting address + void* gmem_address = (void*) raw_pointer_cast(tensor_cwhdn.data()); + + // Gmem extents are just the tensor shape + cute::array gmem_prob_shape = {1,1,1,1,1}; + for_each(make_seq{}, [&](auto i) { + gmem_prob_shape[i] = static_cast(shape(tensor_cwhdn)); + }); + + // Gmem strides are byte strides of the activation tensor in CWHDN order + cute::array gmem_prob_stride = {0,0,0,0,0}; + for_each(make_seq{}, [&](auto i) { + gmem_prob_stride[i] = sizeof(value_type) * stride(tensor_cwhdn); + }); + + // Traversal strides are a function of the dilation shape + // corresponding to spatial (WHD) modes. + cute::array tma_traversal_strides = {1,1,1,1,1}; + for_each(make_seq{}, [&](auto i) { + tma_traversal_strides[i+1] = static_cast(get(stride_whd)); + }); + + cute::array tma_lower_corner{}; + for_each(make_seq{}, [&](auto i) { + tma_lower_corner[i] = static_cast(get(lower_corner_whd)); + }); + + cute::array tma_upper_corner{}; + for_each(make_seq{}, [&](auto i) { + tma_upper_corner[i] = static_cast(get(upper_corner_whd)); + }); + + Im2ColTmaDescriptor tma_desc; + +#if (__CUDACC_VER_MAJOR__ >= 12) + + CUtensorMapDataType tma_format = TMA::to_CUtensorMapDataType(); + CUtensorMapInterleave tma_interleave = CU_TENSOR_MAP_INTERLEAVE_NONE; + CUtensorMapL2promotion tma_l2Promotion = to_CUtensorMapL2promotion(aux_params.l2promo_); + CUtensorMapFloatOOBfill tma_oob_fill = to_CUtensorMapFloatOOBfill(aux_params.oobfill_); + TMA::SmemSwizzleBits swizzle_bits = detail::get_tma_swizzle_bits(smem_swizzle); + TMA::SmemSwizzleBase swizzle_base = detail::get_tma_swizzle_base(smem_swizzle); + CUtensorMapSwizzle tma_swizzle = TMA::to_CUtensorMapSwizzle(swizzle_bits, swizzle_base); + + CUresult encode_result = CUTLASS_CUDA_DRIVER_WRAPPER_CALL(cuTensorMapEncodeIm2col)( + &tma_desc, + tma_format, + num_total_modes, + gmem_address, + gmem_prob_shape.data(), + gmem_prob_stride.data() + 1, // gmem_prob_stride[0] implicitly sizeof(value_type) + tma_lower_corner.data(), + tma_upper_corner.data(), + range_c, + range_whdn, + tma_traversal_strides.data(), + tma_interleave, + tma_swizzle, + tma_l2Promotion, + tma_oob_fill); + + // The extra asserts help indicate the error's cause. 
+ assert(encode_result != CUDA_ERROR_DEINITIALIZED); + assert(encode_result != CUDA_ERROR_NOT_INITIALIZED); + assert(encode_result != CUDA_ERROR_INVALID_CONTEXT); + assert(encode_result != CUDA_ERROR_INVALID_VALUE); + assert(encode_result == CUDA_SUCCESS); + +#endif // (__CUDACC_VER_MAJOR__ >= 12) + // + // Calculate gemm shapes and linearized shapes based on tma layout tiling. + // + + // Compute [w, h, d, n] + // q/p/z = (w/h/d + (upper_corner_whd - lower_corner_whd - 1)) / stride_whd + 1 + auto gemm_mn_ = cute::transform(cute::make_seq{}, [&](auto i) { + return (shape(tensor_cwhdn) + get(upper_corner_whd) - get(lower_corner_whd) - Int<1>{}) / get(stride_whd) + Int<1>{}; + }); + auto gemm_mn = append(gemm_mn_, shape(tensor_cwhdn)); + + // Compute [c, s, r, t] + // fprop/wgrad, s/r/t = 1 + (upper_padding_whd - upper_corner_whd) / stride_srt + // wgrad, s/r/t = 1 + (lower_padding_whd - lower_corner_whd) / stride_srt + auto gemm_k_ = cute::transform(cute::make_seq{}, [&](auto i) { + auto padding_size = conditional_return(get(stride_srt) > Int<0>{}, + get(upper_padding_whd) - get(upper_corner_whd), + get(lower_corner_whd) - get(lower_padding_whd)); + return Int<1>{} + padding_size / get(stride_srt); + }); + auto gemm_k = prepend(gemm_k_, shape<0>(tensor_cwhdn)); + + // For fprop/dgrad kernel, gemm_shapes is ((q, p, z, n), (c, s, r, t)) + // For wgrad kernel, gemm_shapes is ((c, s, r, t), (q, p, z, n)) + auto gemm_shapes_common = make_shape( + transform_leaf(gemm_mn, [](auto s) { + return conditional_return(cute::is_static{}, s, cutlass::FastDivmod(s)); + }), + gemm_k); + auto gemm_shapes = make_shape( + basis_get(stride<0,1>(tma_layout_vt), gemm_shapes_common), + basis_get(stride<0,0>(tma_layout_vt), gemm_shapes_common)); + + // For fprop/dgrad kernel, linearized shapes is (whdn, (c, s, r, t)) + // For wgrad kernel linearized shapes is ((c, s, r, t), whdn) + auto linear_shapes_common = make_shape(size(gemm_mn), gemm_k); + auto linear_shapes = make_shape( + basis_get(stride<0,1>(tma_layout_vt), linear_shapes_common), + basis_get(stride<0,0>(tma_layout_vt), linear_shapes_common)); + + // + // Calculate gmem basis stride based on tma layout tiling. 
+ // + + auto tma_basis_scale = make_shape(Int<1>{}, stride_whd, Int<1>{}, stride_srt); + auto tma_basis = elem_scale(tma_basis_scale, make_basis_like(tma_basis_scale)); + + auto gbasis_strides_common = make_stride( + append(get<1>(tma_basis), get<2>(tma_basis)), + prepend(get<3>(tma_basis), get<0>(tma_basis))); // ((w,h,d,n),(c,s,r,t)) + auto gbasis_strides = make_stride( + basis_get(stride<0,1>(tma_layout_vt), gbasis_strides_common), + basis_get(stride<0,0>(tma_layout_vt), gbasis_strides_common)); + + // + // Create tma tensor + // + + auto lower_corner = make_arithmetic_tuple(Int<0>{}, lower_corner_whd, Int<0>{}, lower_srt); + + auto tensor_multimode = make_tensor(ArithmeticTupleIterator(lower_corner), gemm_shapes, gbasis_strides); + auto tensor_linear = make_identity_tensor(linear_shapes); + auto tma_tensor = make_tensor(tensor_multimode.data(), composition( + tensor_multimode.layout(), + tensor_linear(Int<0>{}), + tensor_linear.layout())); + + return cute::make_tuple(tma_desc, tma_tensor); +} + +template +CUTE_HOST_RTC +auto +make_tma_atom_im2col(CopyOp, + Tensor const& gtensor, // Full GMEM Tensor: ((w, h, d, n), c) + SLayout const& slayout, // CTA Tile of SMEM, potentially swizzled + int32_t const& num_multicast, // The number of CTAs involved in multicasting + Layout const& cta_v_map, // V: CTA val idx -> gmem mode + LowerCornerStride const& lower_corner_whd, + UpperCornerStride const& upper_corner_whd, + LowerPaddingStride const& lower_padding_whd, + UpperPaddingStride const& upper_padding_whd, + TraversalStride const& stride_whd, // traversal stride + LowerSRTStride const& lower_srt, + DilationStride const& stride_srt, // dilation + TMA::DescriptorAuxParams const& aux_params = {}) +{ + // + // TMA parameter checking + // + + CUTE_STATIC_ASSERT_V(product_each(shape(slayout)) == product_each(shape(cta_v_map)), + "TMA requires CTA_Tile and SLayout top-level shape equivalence."); + + // + // TMA slayout manipulation + // + + // Invert the smem to get the largest contiguous vector in the smem layout + auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout)); + // trunc_smem_idx -> trunc_smem_coord + + // Map from smem idx to a gmem mode + auto sidx_to_gmode = coalesce(composition(cta_v_map, inv_smem_layout)); + +#if 0 + print("g_layout : "); print(gtensor.layout()); print("\n"); + print("s_layout : "); print(slayout); print("\n"); + print("cta_t_map : "); print(cta_t_map); print("\n"); + print("cta_v_map : "); print(cta_v_map); print("\n"); + print("inv_smem : "); print(inv_smem_layout); print("\n"); + print("sidx_to_gmode : "); print(sidx_to_gmode); print("\n"); +#endif + + // + // TMA gtensor manipulation + // + + // Generate a TupleBasis for the gtensor + auto glayout_basis = make_identity_layout(product_each(shape(gtensor))); + + // Tile the modes of gtensor with the truncated cta_v_map o inv_smem_layout_trunc + auto tma_layout_full = flatten(composition(glayout_basis, sidx_to_gmode)); + + // Truncate any incompatibilities -- no starting in the middle of gmodes + auto smem_rank = find_if(stride(tma_layout_full), [](auto e) { + [[maybe_unused]] auto v = basis_value(e); + return not is_constant<1,decltype(v)>{}; + }); + static_assert(smem_rank >= 2, "IM2COL expects at least 2 modes of the smem to vectorize with gmem."); + // IM2COL uses a maximum of 2 modes + constexpr int smem_tma_rank = cute::min(int(smem_rank), 2); + + // Keep only the static-1 basis modes into gmem + auto tma_layout_trunc = take<0,smem_tma_rank>(tma_layout_full); + + // Split according to the 
portion each multicast CTA will be responsible for + auto tma_layout_vt = logical_divide(tma_layout_trunc, shape_div(size(tma_layout_trunc), num_multicast)); + +#if 0 + print("glayout_basis : "); print(glayout_basis); print("\n"); + print("tma_layout_full : "); print(tma_layout_full); print("\n"); + + print("tma_layout_trunc: "); print(tma_layout_trunc); print("\n"); + print("tma_layout_vt : "); print(tma_layout_vt); print("\n"); +#endif + + auto range_c = size<0,0>(tma_layout_vt); + auto range_whdn = size<0,1>(tma_layout_vt); + Tensor gtensor_cwhdn = make_tensor(gtensor.data(), + flatten(make_layout(make_layout(basis_get(stride<0,0>(tma_layout_vt), gtensor.shape()), + basis_get(stride<0,0>(tma_layout_vt), gtensor.stride())), + make_layout(basis_get(stride<0,1>(tma_layout_vt), gtensor.shape()), + basis_get(stride<0,1>(tma_layout_vt), gtensor.stride()))))); + auto [tma_desc, tma_tensor] = make_im2col_tma_copy_desc( + gtensor_cwhdn, + range_c, + range_whdn, + detail::get_swizzle_portion(slayout), + tma_layout_vt, + lower_corner_whd, + upper_corner_whd, + lower_padding_whd, + upper_padding_whd, + stride_whd, + lower_srt, + stride_srt, + aux_params); + + // + // Construct the Copy_Traits + // + + using T = typename GEngine::value_type; + constexpr int num_bits_per_tma = decltype(size(tma_layout_trunc))::value * sizeof(T) * 8; + + using Traits = Copy_Traits, decltype(tma_tensor)>; + using Atom = Copy_Atom; + +#if 0 + print("num_bits : "); print(num_bits_per_tma); print("\n"); +#endif + + Traits tma_traits{tma_desc, tma_tensor}; + + // Return the Copy_Atom + return Atom{tma_traits}; +} + +/// Make a TiledCopy for im2col TMA load. +/// +/// @param copy_op The copy implementation: either +/// SM90_TMA_LOAD_IM2COL or SM90_TMA_LOAD_IM2COL_MULTICAST. +/// +/// @param tensor_cwhdn The global tensor to use for im2col TMA loads. +/// For Fprop convolutions, this is the activation tensor. This is +/// the "original tensor that points to global memory, not the +/// coordinate (im2col-transformed) tensor. +/// +/// @param slayout Layout of shared memory tile. +/// +/// @param stride_whd The traversal strides convolution +/// parameter. +/// +/// @return TiledCopy specialization for im2col TMA loads. 
+template +CUTE_HOST_RTC +auto +make_tma_copy_im2col(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout, + Layout const& cta_t_map, // CTA tid -> logical TMA tid + Layout const& cta_v_map, // CTA vid -> gmem coord + LowerCornerStride const& lower_corner_whd, + UpperCornerStride const& upper_corner_whd, + LowerPaddingStride const& lower_padding_whd, + UpperPaddingStride const& upper_padding_whd, + TraversalStride const& stride_whd, // traversal stride + LowerSRTStride const& lower_srt, + DilationStride const& stride_srt, // dilation + TMA::DescriptorAuxParams const& aux_params = {}) +{ + // + // TMA parameter checking + // + + CUTE_STATIC_ASSERT_V(size(slayout) % cosize(cta_t_map) == Int<0>{}, + "Number of active CTAs in TMA must divide domain size of slayout."); + + Copy_Atom atom = make_tma_atom_im2col(copy_op, gtensor, slayout, cosize(cta_t_map), cta_v_map, + lower_corner_whd, upper_corner_whd, lower_padding_whd, + upper_padding_whd, stride_whd, lower_srt, stride_srt, aux_params); + + // + // Construct the TiledCopy + // + + auto cta_tiler = product_each(shape(cta_v_map)); + + auto num_elems_per_tma = size<1>(typename decltype(atom)::RefLayout{}) / static_value>(); + + // smem idx -> smem coord + auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout)); + // CTA V -> smem_coord + auto layout_v = composition(inv_smem_layout, num_elems_per_tma); + // Scale that up to cover all of the smem_coords + auto layout_V = tile_to_shape(make_layout(layout_v), size(cta_v_map)); + // CTA T -> smem idx + auto layout_t = make_layout(cosize(cta_t_map), shape_div(num_elems_per_tma, cosize(cta_t_map))); + // CTA TID -> smem coord + auto layout_T = composition(inv_smem_layout, composition(layout_t, cta_t_map)); + // Combine with the T mapping + [[maybe_unused]] auto layout_TV = make_layout(layout_T, layout_V); + +#if 0 + print("cta_tiler : "); print(cta_tiler); print("\n"); + print("layout_v : "); print(layout_v); print("\n"); + print("layout_V : "); print(layout_V); print("\n"); + print("layout_t : "); print(layout_t); print("\n"); + print("layout_T : "); print(layout_T); print("\n"); + print("layout_TV : "); print(layout_TV); print("\n"); +#endif + + return TiledCopy{atom}; +} + +/// Make a TiledCopy for im2col TMA with no offsets. +/// E.g. im2col TMA load for C and im2col TMA store for D. 
+template +CUTE_HOST_RTC +auto +make_tma_copy_im2col(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout, + Layout const& cta_t_map, // CTA tid -> logical TMA tid + Layout const& cta_v_map) // CTA vid -> gmem coord +{ + constexpr int num_spatial_modes = rank<0>(GLayout{}) - 1; + return make_tma_copy_im2col(copy_op, gtensor, slayout, cta_t_map, cta_v_map, + append(Stride<_0>{}, Int<0>{}), // lower_corner_whd + append(Stride<_0>{}, Int<0>{}), // upper_corner_whd + append(Stride<_0>{}, Int<0>{}), // lower_padding_whd + append(Stride<_0>{}, Int<0>{}), // upper_padding_whd + append(Stride<_1>{}, Int<1>{}), // stride_whd + append(Stride<_0>{}, Int<0>{}), // lower_srt + append(Stride<_1>{}, Int<1>{})); // stride_srt +} + +} // namespace detail + + + +template +CUTE_HOST_RTC +auto +make_im2col_tma_copy(CopyOp const& copy_op, + Tensor const& tensor_cwhdn, + SLayout const& slayout, + CTATiler const& cta_tiler, + MulticastSize const& multicast_size, + LowerCornerStride const& lower_corner_whd, + UpperCornerStride const& upper_corner_whd, + LowerPaddingStride const& lower_padding_whd, + UpperPaddingStride const& upper_padding_whd, + TraversalStride const& stride_whd, + LowerSRTStride const& lower_srt, + DilationStride const& stride_srt) +{ + auto cta_v_tile = make_identity_layout(product_each(shape(tensor_cwhdn))).compose(cta_tiler); + auto cta_t_tile = make_layout(multicast_size); + + return detail::make_tma_copy_im2col(copy_op, tensor_cwhdn, + slayout, cta_t_tile, cta_v_tile, + lower_corner_whd, upper_corner_whd, lower_padding_whd, upper_padding_whd, stride_whd, lower_srt, stride_srt); +} + +// Explicit default for multicast_size +template +CUTE_HOST_RTC +auto +make_im2col_tma_copy(CopyOp const& copy_op, + Tensor const& tensor_cwhdn, + SLayout const& slayout, + CTATiler const& cta_tiler, + LowerCornerStride const& lower_corner_whd, + UpperCornerStride const& upper_corner_whd, + LowerPaddingStride const& lower_padding_whd, + UpperPaddingStride const& upper_padding_whd, + TraversalStride const& stride_whd, + LowerSRTStride const& lower_srt, + DilationStride const& stride_srt) +{ + return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, cta_tiler, Int<1>{}, + lower_corner_whd, upper_corner_whd, lower_padding_whd, upper_padding_whd, stride_whd, lower_srt, stride_srt); +} + +// Explicit default for cta_tiler and multicast_size +template +CUTE_HOST_RTC +auto +make_im2col_tma_copy(CopyOp const& copy_op, + Tensor const& tensor_cwhdn, + SLayout const& slayout, + LowerCornerStride const& lower_corner_whd, + UpperCornerStride const& upper_corner_whd, + LowerPaddingStride const& lower_padding_whd, + UpperPaddingStride const& upper_padding_whd, + TraversalStride const& stride_whd, + LowerSRTStride const& lower_srt, + DilationStride const& stride_srt) +{ + return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, product_each(shape(slayout)), Int<1>{}, + lower_corner_whd, upper_corner_whd, lower_padding_whd, upper_padding_whd, stride_whd, lower_srt, stride_srt); +} + +// No offsets copy. 
+template +CUTE_HOST_RTC +auto +make_im2col_tma_copy(CopyOp const& copy_op, + Tensor const& tensor_cwhdn, + SLayout const& slayout, + CTATiler const& cta_tiler, + MulticastSize const& multicast_size) +{ + auto cta_v_tile = make_identity_layout(product_each(shape(tensor_cwhdn))).compose(cta_tiler); + auto cta_t_tile = make_layout(multicast_size); + + return detail::make_tma_copy_im2col(copy_op, tensor_cwhdn, slayout, cta_t_tile, cta_v_tile); +} + +// Explicit default for multicast_size +template +CUTE_HOST_RTC +auto +make_im2col_tma_copy(CopyOp const& copy_op, + Tensor const& tensor_cwhdn, + SLayout const& slayout, + CTATiler const& cta_tiler) +{ + return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, cta_tiler, Int<1>{}); +} + +// Explicit default for cta_tiler and multicast_size +template +CUTE_HOST_RTC +auto +make_im2col_tma_copy(CopyOp const& copy_op, + Tensor const& tensor_cwhdn, + SLayout const& slayout) +{ + return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, product_each(shape(slayout)), Int<1>{}); +} + +} // namespace cute diff --git a/include/cute/atom/copy_traits_sm90_tma.hpp b/include/cute/atom/copy_traits_sm90_tma.hpp new file mode 100644 index 0000000000..4ad7f80851 --- /dev/null +++ b/include/cute/atom/copy_traits_sm90_tma.hpp @@ -0,0 +1,1560 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
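Taken together, the make_im2col_tma_copy overloads above construct both the Im2ColTmaDescriptor and the TMA coordinate tensor from a single call. A hedged host-side sketch of the fully parameterized overload, using the same zero-padding / unit-stride / unit-dilation values as the no-offsets overload; the tensor, layout, and tiler names are invented for illustration:

// Sketch only: gA is a (C,W,H,N) half_t activation tensor in gmem, sA_layout
// the (possibly swizzled) CTA smem layout, and cta_tiler the CTA tile shape.
auto tma_load_a = make_im2col_tma_copy(
    SM90_TMA_LOAD_IM2COL{},
    gA,                              // full gmem activation tensor
    sA_layout,                       // CTA tile of smem
    cta_tiler,                       // CTA tile over the gmem modes
    Int<1>{},                        // multicast size (no multicast)
    make_tuple(Int<0>{}, Int<0>{}),  // lower_corner_whd
    make_tuple(Int<0>{}, Int<0>{}),  // upper_corner_whd
    make_tuple(Int<0>{}, Int<0>{}),  // lower_padding_whd
    make_tuple(Int<0>{}, Int<0>{}),  // upper_padding_whd
    make_tuple(Int<1>{}, Int<1>{}),  // stride_whd (traversal stride)
    make_tuple(Int<0>{}, Int<0>{}),  // lower_srt
    make_tuple(Int<1>{}, Int<1>{})); // stride_srt (dilation)
// Partitioning then uses tma_load_a.get_tma_tensor(...) to obtain the
// im2col coordinate tensor that copy_unpack expects as its source.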
+ * + **************************************************************************************************/ +#pragma once + +#if !defined(__CUDACC_RTC__) +#include +#endif + +#include +#include +#include + +#include + +#include + +#include + +namespace cute +{ + +template +struct AuxTmaParams { + using GmemStrides = GmemTmaBasisStrides_; // Strides for Gmem mode -> Tma coord mode, may be dynamic + GmemStrides g_stride_; + using TmaGmemBasis = TmaGmemBasis_; // Layout for Tma box shape -> Gmem mode(s), always static + static_assert(is_static::value); + using TmaSwizzle = TmaSwizzle_; // Tma swizzle, always Swizzle + static_assert(is_static::value); +}; + +// Utility for unpacking TMA_LOAD arguments into a CopyOp +template +struct TMA_LOAD_Unpack +{ + template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + static_assert(is_smem::value, "SM90_TMA_LOAD requires the destination be shared memory."); + + auto src_coord = src.data().coord_; + void* dst_ptr = cute::raw_pointer_cast(dst.data()); +#if 0 + auto [c0,c1,c2,c3,c4] = append<5>(src_coord, 0); + printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n", + threadIdx.x, threadIdx.y, threadIdx.z, + blockIdx.x, blockIdx.y, blockIdx.z, + int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), dst_ptr); +#endif + return detail::explode_tuple(detail::CallCOPY{}, + traits.opargs_, tuple_seq{}, + make_tuple(dst_ptr), seq<0>{}, + src_coord, tuple_seq{}); + } +}; + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////// TMA_LOAD /////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_LOAD_OP : SM90_TMA_LOAD {}; + +// The non-executable SM90_TMA_LOAD with tma_desc and no tma_mbar +// Use .with(tma_mbar) to construct an executable version +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD arguments + TmaDescriptor tma_desc_; + using AuxParams = AuxParams_; + AuxParams aux_params_; + + // Return TmaDescriptor/TensorMap + CUTE_HOST_DEVICE constexpr + TmaDescriptor const* + get_tma_descriptor() const { + return &tma_desc_; + } + + // Construct an executable SM90_TMA_LOAD with tma_mbar + CUTE_HOST_DEVICE constexpr + Copy_Traits + with( + uint64_t& tma_mbar, + [[maybe_unused]] uint16_t const& multicast_mask = 0, + TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const { + // We accept multicast_mask here to keep the API for both atoms consistent + return {&tma_desc_, &tma_mbar, static_cast(cache_hint)}; + } + + // Construct an executable SM90_TMA_LOAD with tma_mbar (temp. 
overloaded for grouped gemm/ptr array gemm) + CUTE_HOST_DEVICE constexpr + Copy_Traits + with( + TmaDescriptor const* new_tma_desc, + uint64_t& tma_mbar, + [[maybe_unused]] uint16_t const& multicast_mask = 0, + TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const { + // We accept multicast_mask here to keep the API for both atoms consistent + return {new_tma_desc, &tma_mbar, static_cast(cache_hint)}; + } + + // Generate the TMA coord tensor + template + CUTE_HOST_DEVICE constexpr + auto + get_tma_tensor(GShape const& g_shape) const { + static_assert(is_congruent::value); + return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + } + + // Don't try to execute a copy with SM90_TMA_LOAD before calling .with() + template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) = delete; +}; + +// The executable SM90_TMA_LOAD with tma_desc and tma_mbar +template +struct Copy_Traits + : TMA_LOAD_Unpack +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD arguments + tuple< + TmaDescriptor const*, + uint64_t*, // smem mbarrier + uint64_t // cache hint + > const opargs_; + + CUTE_HOST_DEVICE + Copy_Traits(TmaDescriptor const* desc, uint64_t* mbar, uint64_t cache) + : opargs_(desc, mbar, cache) {} +}; + +// The prefetch for SM90_TMA_LOAD with tma_desc +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD::PREFETCH arguments + tuple const opargs_; + + // Construct with any other Traits' TMA Desc + template + CUTE_HOST_DEVICE + Copy_Traits(Copy_Traits const& traits) + : opargs_({&traits.tma_desc_}) {} + + template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + auto src_coord = src.data().coord_; + return detail::explode_tuple(detail::CallCOPY{}, + traits.opargs_, tuple_seq{}, + src_coord, tuple_seq{}); + } +}; + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////// TMA_LOAD_MULTICAST ///////////////////////////// +////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_LOAD_MULTICAST_OP : SM90_TMA_LOAD_MULTICAST {}; + +// The non-executable SM90_TMA_LOAD_MULTICAST with tma_desc and no tma_mbar +// Use .with(tma_mbar, multicast_mask) to construct an executable version +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD_MULTICAST arguments + TmaDescriptor tma_desc_; + using AuxParams = AuxParams_; + AuxParams aux_params_; + + // Return TmaDescriptor/TensorMap + CUTE_HOST_DEVICE constexpr + TmaDescriptor const* + get_tma_descriptor() const { + return &tma_desc_; + } + + // Construct an executable SM90_TMA_LOAD_MULTICAST with tma_mbar + CUTE_HOST_DEVICE constexpr + Copy_Traits + with( + uint64_t& tma_load_mbar, + uint16_t const& multicast_mask, + 
TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const { + return {&tma_desc_, &tma_load_mbar, multicast_mask, static_cast(cache_hint)}; + } + + // Construct an executable SM90_TMA_LOAD_MULTICAST_OP with tma_mbar (temp. overloaded for grouped gemm/ptr array gemm) + CUTE_HOST_DEVICE constexpr + Copy_Traits + with( + TmaDescriptor const* new_tma_desc, + uint64_t& tma_load_mbar, + uint16_t const& multicast_mask, + TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const { + return {new_tma_desc, &tma_load_mbar, multicast_mask, static_cast(cache_hint)}; + } + + // Generate the TMA coord tensor + template + CUTE_HOST_DEVICE constexpr + auto + get_tma_tensor(GShape const& g_shape) const { + static_assert(is_congruent::value); + return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + } + + // Don't try to execute a copy with SM90_TMA_LOAD_MULTICAST before calling .with() + template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) = delete; +}; + +// The executable SM90_TMA_LOAD_MULTICAST with tma_desc and tma_mbar and multicast_mask +template +struct Copy_Traits + : TMA_LOAD_Unpack +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD_MULTICAST arguments + tuple< + TmaDescriptor const*, + uint64_t*, // smem mbarrier + uint16_t, // multicast mask + uint64_t // cache hint + > const opargs_; + + CUTE_HOST_DEVICE + Copy_Traits(TmaDescriptor const* desc, uint64_t* mbar, uint16_t mask, uint64_t hint) + : opargs_(desc, mbar, mask, hint) {} +}; + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////// TMA_STORE ////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_STORE_PTR : SM90_TMA_STORE {}; + +// The executable SM90_TMA_STORE with tma_desc +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_STORE arguments + TmaDescriptor tma_desc_; + using AuxParams = AuxParams_; + AuxParams aux_params_; + + // Return TmaDescriptor/TensorMap + CUTE_HOST_DEVICE constexpr + TmaDescriptor const* + get_tma_descriptor() const { + return &tma_desc_; + } + + // Generate the TMA coord tensor + template + CUTE_HOST_DEVICE constexpr + auto + get_tma_tensor(GShape const& g_shape) const { + static_assert(is_congruent::value); + return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + } + + // Construct new TMA_STORE with (unsafe) swapped out TMA descriptor ptr (for grouped gemm/ptr array gemm) + CUTE_HOST_DEVICE constexpr + Copy_Traits + with(TmaDescriptor const* new_tma_desc) const { + return {new_tma_desc}; + } + + template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + static_assert(is_smem::value, "Expected smem src for SM90_TMA_STORE"); + //static_assert(is_gmem::value, "Expected gmem dst for SM90_TMA_STORE"); // TMA spoofed src tensor + + void const* const desc_ptr = &(traits.tma_desc_); + void const* const src_ptr = 
cute::raw_pointer_cast(src.data()); + auto dst_coord = dst.data().coord_; +#if 0 + auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0); + printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n", + threadIdx.x, threadIdx.y, threadIdx.z, + blockIdx.x, blockIdx.y, blockIdx.z, + int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr); +#endif + return detail::explode_tuple(detail::CallCOPY{}, + make_tuple(desc_ptr, src_ptr), seq<0,1>{}, + dst_coord, tuple_seq{}); + } +}; + +// Same as SM90_TMA_STORE, but with an unsafe TMA Desc PTR instead +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_STORE arguments + TmaDescriptor const* tma_desc_; + + template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + static_assert(is_smem::value, "Expected smem src for SM90_TMA_STORE"); + //static_assert(is_gmem::value, "Expected gmem dst for SM90_TMA_STORE"); // TMA spoofed src tensor + + void const* const desc_ptr = traits.tma_desc_; + void const* const src_ptr = cute::raw_pointer_cast(src.data()); + auto dst_coord = dst.data().coord_; +#if 0 + auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0); + printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n", + threadIdx.x, threadIdx.y, threadIdx.z, + blockIdx.x, blockIdx.y, blockIdx.z, + int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr); +#endif + return detail::explode_tuple(detail::CallCOPY{}, + make_tuple(desc_ptr, src_ptr), seq<0,1>{}, + dst_coord, tuple_seq{}); + } +}; + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////// TMA_REDUCE_ADD ////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + +// The executable SM90_TMA_REDUCE_ADD with tma_desc +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_REDUCE_ADD arguments + TmaDescriptor tma_desc_; + using AuxParams = AuxParams_; + AuxParams aux_params_; + + // Return TmaDescriptor/TensorMap + CUTE_HOST_DEVICE constexpr + TmaDescriptor const* + get_tma_descriptor() const { + return &tma_desc_; + } + + // Generate the TMA coord tensor + template + CUTE_HOST_DEVICE constexpr + auto + get_tma_tensor(GShape const& g_shape) const { + static_assert(is_congruent::value); + return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + } + + template + CUTE_HOST_DEVICE constexpr + void + copy_unpack_(void const* const src_ptr, + Coord const& dst_coord, seq) const + { +#if 0 + auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0); + printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n", + threadIdx.x, threadIdx.y, threadIdx.z, + blockIdx.x, blockIdx.y, blockIdx.z, + int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr); +#endif + + SM90_TMA_REDUCE_ADD::copy(&tma_desc_, + src_ptr, get(dst_coord)...); + } + + // This is the copy_unpack dispatch for this Copy_Traits + // Src needs to be a smem tensor + // Dst needs to be a gmem tensor with 
TmaCoordIterator .data() + template + CUTE_HOST_DEVICE friend constexpr + void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + static_assert(is_smem::value, "Expected smem src for SM90_TMA_REDUCE_ADD"); + //static_assert(is_gmem::value, "Expected gmem dst for SM90_TMA_REDUCE_ADD"); // TMA spoofed src tensor + + traits.copy_unpack_(cute::raw_pointer_cast(src.data()), dst.data().coord_, tuple_seq{}); + } +}; + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////// BULK COPY ////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + +template +struct Copy_Traits +{ + static_assert(int32_t(NumBitsPerTMA::value / 8) % 16 == 0, + "Bulk Copy requires copy vector size align to 16B."); + + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_BULK_COPY_G2S arguments + // 0: uint64_t* bulk_load_memory_barrier + cute::tuple bulk_load_mbar_; + + // Record the memory barrier for the instruction + CUTE_HOST_DEVICE constexpr + Copy_Traits + with(uint64_t& bulk_mbar) const { + return {&bulk_mbar}; + } + + template + CUTE_HOST_DEVICE friend constexpr + void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + static_assert(is_same, cute::tuple>::value, + "Extra arguments not set. Set .with() before use."); + static_assert(is_gmem::value, "Expected gmem src for SM90_BULK_COPY_G2S"); + static_assert(is_smem::value, "Expected smem dst for SM90_BULK_COPY_G2S"); + SM90_BULK_COPY_G2S::copy(raw_pointer_cast(src.data()), get<0>(traits.bulk_load_mbar_), + raw_pointer_cast(dst.data()), int32_t(NumBitsPerTMA::value / 8)); + } +}; + +template +struct Copy_Traits + : Copy_Traits +{ + template + CUTE_HOST_DEVICE + Copy_Traits(Copy_Traits const& traits) {} + + template + CUTE_HOST_DEVICE friend constexpr + void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + static_assert(is_gmem::value, "Expected gmem src for SM90_BULK_PREFETCH"); + SM90_BULK_COPY_G2S::PREFETCH::copy(raw_pointer_cast(src.data()), int32_t(NumBitsPerTMA::value / 8)); + } +}; + +template +struct Copy_Traits +{ + static_assert(int32_t(NumBitsPerTMA::value / 8) % 16 == 0, + "Bulk Copy requires copy vector size align to 16B."); + + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + template + CUTE_HOST_DEVICE friend constexpr + void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + static_assert(is_smem::value, "Expected smem src for SM90_BULK_COPY_S2G"); + static_assert(is_gmem::value, "Expected gmem dst for SM90_BULK_COPY_S2G"); + SM90_BULK_COPY_S2G::copy(raw_pointer_cast(src.data()), raw_pointer_cast(dst.data()), int32_t(NumBitsPerTMA::value / 8)); + } +}; + +// +// Placeholder for the bulk copy algorithm's default, auto-vectorizing behavior +// + +template +struct Copy_Traits +{ + // Logical thread id to thread idx (one-thread) + using ThrID = Layout<_1>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, Stride<_0,_0>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout, Stride<_0,_0>>; + // Reference 
map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_UBULK_COPY arguments + // 0: uint64_t* bulk_load_memory_barrier [if this is a BULK_LOAD_G2S] + cute::tuple opargs_; + + // Record the memory barrier for the instruction + CUTE_HOST_DEVICE constexpr + Copy_Traits + with(uint64_t& bulk_mbar) const { + return {&bulk_mbar}; + } +}; + +// +// MAKE_TMA_COPY and related +// + +namespace detail { + +// Custom version of coalesce that greedily combines modes only up to size-256 +// Look at each element and the back of the stack (in order of priority) +// back(NewLayout) get(OldLayout) +// s0:d0 _1:d1 => continue +// _1:d0 s1:d1 => replace_back s1:d1 +// s0:d0 s1:s0*d0 => replace_back s0*s1:d0 if s0*s1 <= 256 +// s0:d0 s1:d1 => append s1:d1 +// +// @pre OldShape and OldStride are flat +template +CUTE_HOST_DEVICE constexpr +auto +coalesce_256_impl(OldShape const& old_shape, OldStride const& old_stride, + NewShape const& new_shape, NewStride const& new_stride) +{ + if constexpr (I == rank_v) { + // Base case, we're done + if constexpr (is_constant<1, NewShape>::value) { + return Layout<_1,_0>{}; + } else { + return Layout{new_shape,new_stride}; + } + } else if constexpr (is_constant<1, decltype(get(old_shape))>::value) { + // shape(layout) == _1, skip it and continue + return coalesce_256_impl(old_shape, old_stride, new_shape, new_stride); + } else if constexpr (is_constant<1, NewShape>::value) { + // Replace our shape-1 with anything (Can only happen on input new_shape/new_stride) + return coalesce_256_impl(old_shape, old_stride, get(old_shape), get(old_stride)); + } else if constexpr (is_constant(old_stride) && + get(old_shape) * back(new_shape) <= Int<256>{})>::value) { + // Merge modes because the shapes and strides match and the merge is 256 or less + return coalesce_256_impl(old_shape, old_stride, + replace_back(new_shape, get(old_shape) * back(new_shape)), + new_stride); + } else { + // Can't replace or merge, so append a new mode + return coalesce_256_impl(old_shape, old_stride, + append(new_shape, get(old_shape)), + append(new_stride, get(old_stride))); + } + + CUTE_GCC_UNREACHABLE; +} + +// Combine all the modes that are possible to combine +// Does not respect the profile of the layout, but does preserve total size +template +CUTE_HOST_DEVICE constexpr +auto +coalesce_256(Layout const& layout) +{ + auto flat_shape = flatten(layout.shape()); + auto flat_stride = flatten(layout.stride()); + return coalesce_256_impl<1>(flat_shape, flat_stride, get<0>(flat_shape), get<0>(flat_stride)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +construct_tma_gbasis(Tensor const& gtensor, // The original GMEM Tensor + Layout const& slayout, // The layout of SMEM + Layout const& cta_v_map) // smem_idx to hier gmode +{ + // + // TMA parameter checking + // + + // CUTE_STATIC_ASSERT_V(product_each(shape(slayout)) == product_each(shape(cta_v_map)), + // "TMA requires CTA_Tile and SLayout top-level shape equivalence."); + CUTE_STATIC_ASSERT_V(size(slayout) == size(cta_v_map), + "TMA requires CTA_Tile and SLayout top-level size equivalence."); + +#if 0 + print("gtensor : "); print(gtensor); print("\n"); + print("slayout : "); print(slayout); print("\n"); + print("cta_v_map : "); print(cta_v_map); print("\n"); +#endif + + // + // TMA slayout manipulation + // + + // Invert the smem to get the largest contiguous vector in the smem layout + // smem idx -> smem coord + auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout)); + + // Compose with the V-Map to convert smem coord (CTA 
val idx) to gmem mode + // smem idx -> gmem mode + auto sidx2gmode_full = coalesce(composition(cta_v_map, inv_smem_layout)); + +#if 0 + print("inv_smem_layout : "); print(inv_smem_layout); print("\n"); + print("sidx2gmode_full : "); print(sidx2gmode_full); print("\n"); +#endif + + // + // TMA gtensor truncation + // + + // Truncate any incompatibilities -- no starting in the middle of gmodes + auto smem_rank = find_if(stride(sidx2gmode_full), [](auto e) { + [[maybe_unused]] auto v = basis_value(e); + return not is_constant<1,decltype(v)>{}; + }); + static_assert(smem_rank > 0, "Could not find a common tile-gmem vectorization. Does the Tile select out major GMEM modes?"); + + // Keep only the static-1 basis modes into gmem + auto sidx2gmode = take<0,smem_rank>(sidx2gmode_full); + +#if 0 + print("smem_rank : "); print(smem_rank); print("\n"); + print("sidx2gmode : "); print(sidx2gmode); print("\n"); +#endif + + // + // TMA gtensor manipulation + // + + // The smem vector is the same units as gtensor, so compose first and then recast + // tma_val_idx:gmem_strides + auto tile_gstride = recast(gtensor.compose(sidx2gmode)).layout(); + // Coalesce modes up to size-256 (the maximum TMA box extent in units of TmaInternalType) + // tma_box_shape:gmem_strides + auto tma_gstride = coalesce_256(tile_gstride); + + // Perform the tiling, recast, and coalesce to the gmem vector again, but with indirections to the gtensor modes + auto gbasis = make_identity_layout(shape(gtensor)); + auto tile_gbasis_tmp = gbasis.compose(sidx2gmode); + + // Instead of the recast (gbasis doesn't have type info), replace the shape with the already-recasted shape + // tma_box_shape:gmem_mode + auto tile_gbasis = make_layout(shape(tile_gstride), stride(tile_gbasis_tmp)); + + // "Coalesce" the tile basis into a compatible shape with the tma_gstride + auto tma_gbasis_tile = tile_gbasis.compose(make_layout(wrap(shape(tma_gstride)))); + + // Recast the original tensor for shape/stride inspections + Tensor gtensor_T = recast(gtensor); + + // Find missing bases that don't appear in tile_gbasis + auto tile_gbasis_remaining_stride = filter_tuple(flatten(shape (gtensor_T)), flatten(stride(gtensor_T)), + flatten(stride(gbasis)), + [&](auto s, auto d, auto e) + { + if constexpr (is_constant<1, decltype(s)>::value || is_constant<0, decltype(d)>::value) { + return cute::tuple<>{}; // If size-1 or stride-0, then don't append + } else { + using E = decltype(e); + auto has_e = any_of(flatten(stride(tma_gbasis_tile)), [] (auto tb) { return tb == E{}; }); + if constexpr (decltype(has_e)::value) { + return cute::tuple<>{}; // If d was found, then don't append + } else { + return cute::tuple(e); // Else, this is missing so append + } + } + }); + + // Append the remaining basis modes that contribute to the TMA with size-1 + auto tile_gbasis_remaining_shape = repeat(Int<1>{}); + auto tma_gbasis_full = make_layout(tuple_cat(wrap( shape(tma_gbasis_tile)), wrap(tile_gbasis_remaining_shape )), + tuple_cat(wrap(stride(tma_gbasis_tile)), wrap(tile_gbasis_remaining_stride))); + + // Group the trailing modes to make this max rank-5 -- TMA rank limitation + // tma_box_shape:gmem_mode + auto tma_gbasis = group(tma_gbasis_full); + +#if 0 + print("tile_gstride : "); print(tile_gstride); print("\n"); + print("tma_gstride : "); print(tma_gstride); print("\n"); + print("gbasis : "); print(gbasis); print("\n"); + print("tile_gbasis : "); print(tma_gbasis_tile); print("\n"); + print("tma_gbasis : "); print(tma_gbasis); print("\n"); +#endif + + return tma_gbasis; +} 
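+
+// Worked illustration of coalesce_256 (added commentary; results are inferred from
+// the merge rules documented above rather than verified output):
+//   (_128,_2):(_1,_128) -> _256:_1              contiguous and 128*2 <= 256, so the
+//                                               two modes merge into one TMA box mode
+//   (_256,_2):(_1,_256) -> (_256,_2):(_1,_256)  merging would exceed the 256-element
+//                                               box-extent limit, so the modes stay split
+//   (_128,_4):(_1,_256) -> (_128,_4):(_1,_256)  stride 256 != 128*1 (padded gmem), so
+//                                               the modes cannot be merged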
+ +template +CUTE_HOST_DEVICE constexpr +void +fill_tma_gmem_shape_stride(Tensor const& gtensor, // Gmem Shapes and Strides, in units of TmaInternalType + TmaGmemBasisStride const& tma_gbasis_stride, // Map Tma mode idx -> Gmem mode(s) + cute::array & gmem_prob_shape, // Tma Shapes, uint32_t or uin64_t + cute::array & gmem_prob_stride) // Tma Strides +{ + static_assert(is_tuple::value); + static_assert(is_same::value || is_same::value); + + using TmaInternalType = typename GEngine::value_type; + constexpr int tma_rank = decltype(rank(tma_gbasis_stride))::value; + static_assert(TmaRank >= tma_rank); + + auto gmem_shape = shape(gtensor); + auto gmem_stride = stride(gtensor); + // Use the indirections in tma_gbasis_stride into gtensor to construct the tma gmem shapes/strides + for_each(make_seq{}, [&](auto i) { + constexpr int tma_i_rank = decltype(rank(tma_gbasis_stride))::value; + if constexpr (tma_i_rank == 1) { + // Trivial contribution of this gmem mode to this tma mode + auto ej = unwrap(get(tma_gbasis_stride)); + gmem_prob_shape[i] = basis_get(ej, gmem_shape); + gmem_prob_stride[i] = basis_get(ej, gmem_stride); + } else { + // Apply a recurrence to each gmem mode that contributes to this tma mode + for_each(get(tma_gbasis_stride), [&](auto ej) { + // Problem shape + uint64_t shape_j = basis_get(ej, gmem_shape); + // Problem stride (in bytes) + uint64_t stride_j = basis_get(ej, gmem_stride); + uint64_t old_stride = gmem_prob_stride[i]; + gmem_prob_stride[i] = gcd(gmem_prob_stride[i], stride_j); + + if (gmem_prob_stride[i] != 0) { + // Recurrence: g_shape = (s_i - 1) * (d_i / gcd_j d_j) + 1 + gmem_prob_shape[i] = (gmem_prob_shape[i]-1) * (old_stride / gmem_prob_stride[i]) + + (shape_j-1) * (stride_j / gmem_prob_stride[i]) + + 1; + } else { + gmem_prob_shape[i] = shape_j; + } + }); + } + }); +} + +// Overload for an existing Copy_Traits +template +CUTE_HOST_DEVICE constexpr +void +fill_tma_gmem_shape_stride(Copy_Traits const& tma_traits, + Tensor const& gtensor, // Gmem Shapes and Strides, value_type = TmaInternalType + cute::array & gmem_prob_shape, // Tma Shapes, uint32_t or uin64_t + cute::array & gmem_prob_stride) // Tma Strides +{ + return fill_tma_gmem_shape_stride(gtensor, stride(typename Aux::TmaGmemBasis{}), + gmem_prob_shape, gmem_prob_stride); +} + +// Use a sidx2gmode to read through the GMEM tensor +// and construct a TMA Descriptor for the resulting instruction +// At the same time, construct the Tma Tensor's Stride to generate +// the TMA coordinates that the instruction consumes. 
+// +template +CUTE_HOST_RTC +auto +make_tma_copy_desc(Tensor const& gtensor, // The original GMEM Tensor + Layout const& tma_gbasis, // TMA mode -> GMEM mode mapping + Swizzle const& swizzle, // Swizzle fn on smem_idx + uint32_t num_multicast) // The number of CTAs in multicasting +{ + // + // TMA desc creation + // + + constexpr int tma_dim = decltype(rank(tma_gbasis))::value; + + // + // TMA gmem desc info + // + + // Recast the original tensor for shape/stride inspections + Tensor gtensor_T = recast(gtensor); + + void* gmem_address = (void*) raw_pointer_cast(gtensor_T.data()); + auto gmem_layout = gtensor_T.layout(); + + cute::array gmem_prob_shape = {1,1,1,1,1}; + cute::array gmem_prob_stride = {0,0,0,0,0}; + + fill_tma_gmem_shape_stride(gtensor_T, stride(tma_gbasis), gmem_prob_shape, gmem_prob_stride); + + assert((reinterpret_cast(gmem_address) & 0b1111) == 0); // Address must be 16B-aligned + + assert(gmem_prob_shape[0] >= (uint64_t(1))); // Size must be min 1 + assert(gmem_prob_shape[0] <= (uint64_t(1) << 32)); // Size must be max 2^32 + assert(gmem_prob_shape[1] >= (uint64_t(1))); // Size must be min 1 + assert(gmem_prob_shape[1] <= (uint64_t(1) << 32)); // Size must be max 2^32 + assert(gmem_prob_shape[2] >= (uint64_t(1))); // Size must be min 1 + assert(gmem_prob_shape[2] <= (uint64_t(1) << 32)); // Size must be max 2^32 + assert(gmem_prob_shape[3] >= (uint64_t(1))); // Size must be min 1 + assert(gmem_prob_shape[3] <= (uint64_t(1) << 32)); // Size must be max 2^32 + assert(gmem_prob_shape[4] >= (uint64_t(1))); // Size must be min 1 + assert(gmem_prob_shape[4] <= (uint64_t(1) << 32)); // Size must be max 2^32 + + // TMA descriptor does not store the zeroth stride and assumes it is 1 (TmaInternalType element). + assert(gmem_prob_stride[0] == 1 && "Majorness of smem doesn't match majorness of gmem"); + + // convert strides to byte strides + for(uint64_t& stride : gmem_prob_stride) { + stride = (stride * sizeof_bits_v) / 8; + } + + // Assert the byte strides. 
Tma Descriptor uses byte strides + assert((gmem_prob_stride[1]) < (uint64_t(1) << 40)); // Stride must be max 2^40 + assert((gmem_prob_stride[1] & 0b1111) == 0); // Stride must be multiple of 16B (128b) + assert((gmem_prob_stride[2]) < (uint64_t(1) << 40)); // Stride must be max 2^40 + assert((gmem_prob_stride[2] & 0b1111) == 0); // Stride must be multiple of 16B (128b) + assert((gmem_prob_stride[3]) < (uint64_t(1) << 40)); // Stride must be max 2^40 + assert((gmem_prob_stride[3] & 0b1111) == 0); // Stride must be multiple of 16B (128b) + assert((gmem_prob_stride[4]) < (uint64_t(1) << 40)); // Stride must be max 2^40 + assert((gmem_prob_stride[4] & 0b1111) == 0); // Stride must be multiple of 16B (128b) + + // + // TMA smem desc info + // + + cute::array smem_box_shape = {1,1,1,1,1}; + cute::array smem_box_stride = {1,1,1,1,1}; + // The smem box is simply given by the sizes of the modes in tma_gbasis + for_each(make_seq{}, [&](auto i) { + smem_box_shape[i] *= size(tma_gbasis); + }); + // Finally, truncate the tma box by the num_multicast + for (uint32_t i = tma_dim-1, multicast = num_multicast; multicast > 1; --i) { + assert(smem_box_shape[i] % multicast == 0 || multicast % smem_box_shape[i] == 0); + uint32_t new_mult = ceil_div(multicast, smem_box_shape[i]); + smem_box_shape[i] = ceil_div(smem_box_shape[i], multicast); + multicast = new_mult; + } + + assert(smem_box_shape[0] >= (uint32_t(1))); // Size must be min 1 + assert(smem_box_shape[0] <= (uint32_t(1) << 8)); // Size must be max 2^8 = 256 + assert(smem_box_shape[1] >= (uint32_t(1))); // Size must be min 1 + assert(smem_box_shape[1] <= (uint32_t(1) << 8)); // Size must be max 2^8 = 256 + assert(smem_box_shape[2] >= (uint32_t(1))); // Size must be min 1 + assert(smem_box_shape[2] <= (uint32_t(1) << 8)); // Size must be max 2^8 = 256 + assert(smem_box_shape[3] >= (uint32_t(1))); // Size must be min 1 + assert(smem_box_shape[3] <= (uint32_t(1) << 8)); // Size must be max 2^8 = 256 + assert(smem_box_shape[4] >= (uint32_t(1))); // Size must be min 1 + assert(smem_box_shape[4] <= (uint32_t(1) << 8)); // Size must be max 2^8 = 256 + + assert(smem_box_stride[0] >= (uint32_t(1))); // Stride must be min 1 + assert(smem_box_stride[0] <= (uint32_t(8))); // Stride must be max 2^3 = 8 + assert(smem_box_stride[1] >= (uint32_t(1))); // Stride must be min 1 + assert(smem_box_stride[1] <= (uint32_t(8))); // Stride must be max 2^3 = 8 + assert(smem_box_stride[2] >= (uint32_t(1))); // Stride must be min 1 + assert(smem_box_stride[2] <= (uint32_t(8))); // Stride must be max 2^3 = 8 + assert(smem_box_stride[3] >= (uint32_t(1))); // Stride must be min 1 + assert(smem_box_stride[3] <= (uint32_t(8))); // Stride must be max 2^3 = 8 + assert(smem_box_stride[4] >= (uint32_t(1))); // Stride must be min 1 + assert(smem_box_stride[4] <= (uint32_t(8))); // Stride must be max 2^3 = 8 + + // + // Construct the descriptor + // + + TmaDescriptor tma_desc{}; + + // + // TMA general info + // + + #if (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__) + + CUtensorMapDataType tma_format = TMA::to_CUtensorMapDataType(); + CUtensorMapInterleave tma_interleave = CU_TENSOR_MAP_INTERLEAVE_NONE; + CUtensorMapL2promotion tma_l2Promotion = CU_TENSOR_MAP_L2_PROMOTION_L2_128B; + CUtensorMapFloatOOBfill tma_oobFill = CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE; + + // TMA smem swizzle type + TMA::SmemSwizzleBits swizzle_bits = get_tma_swizzle_bits(swizzle); + TMA::SmemSwizzleBase swizzle_base = get_tma_swizzle_base(swizzle); + CUtensorMapSwizzle smem_swizzle = 
TMA::to_CUtensorMapSwizzle(swizzle_bits, swizzle_base); + CUresult result = CUTLASS_CUDA_DRIVER_WRAPPER_CALL(cuTensorMapEncodeTiled)( + &tma_desc, + tma_format, + tma_dim, + gmem_address, + gmem_prob_shape.data(), + gmem_prob_stride.data() + 1, // gmem_prob_stride[0] implicitly 1 + smem_box_shape.data(), + smem_box_stride.data(), + tma_interleave, + smem_swizzle, + tma_l2Promotion, + tma_oobFill); + + if (result != CUDA_SUCCESS) { + std::cerr << "TMA Desc Addr: " << &tma_desc + << "\nformat " << tma_format + << "\ndim " << tma_dim + << "\ngmem_address " << gmem_address + << "\nglobalDim " << gmem_prob_shape + << "\nglobalStrides " << gmem_prob_stride + << "\nboxDim " << smem_box_shape + << "\nelementStrides " << smem_box_stride + << "\ninterleave " << tma_interleave + << "\nswizzle " << smem_swizzle + << "\nl2Promotion " << tma_l2Promotion + << "\noobFill " << tma_oobFill << std::endl; + std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl; + assert(false); + } + + #endif // (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__) + auto recast_ratio = cute::trait_ratio(sizeof_bits{}, + sizeof_bits< TmaInternalType>{}); + + auto gbasis = make_basis_like(shape(gtensor)); + + // Finally, get the inverse permutation of the E bases for the mocked gmem stride + auto gmem_tma_basis_stride = transform_leaf(gbasis, [&](auto ei) { + auto si = basis_get(ei, shape(gmem_layout)); + auto di = basis_get(ei, stride(gmem_layout)); + if constexpr (is_constant<1, decltype(si)>::value || is_constant<0, decltype(di)>::value) { + return Int<0>{}; // If size-1 or stride-0, return arithmetic identity -- no contribution to the TMA + } else { + auto tma_gmem_basis_stride = stride(tma_gbasis); + // Find j such that E is in stride(tma_gbasis) + using EI = decltype(ei); + [[maybe_unused]] auto j = find_if(tma_gmem_basis_stride, [&](auto tma_stride_j) { return any_of(tma_stride_j, [&](auto dj) { return dj == EI{}; }); }); + if constexpr (decltype(j == rank(tma_gmem_basis_stride))::value) { + return Int<0>{}; // If not-found, return arithmetic identity -- no contribution to the TMA + } else + if constexpr (decltype(j == Int<0>{})::value) { + auto scale = recast_ratio * basis_get(ei, stride(gtensor)); + return E{} * scale; // Return TMA Coord basis -- with a recast scale factor + } else + if constexpr (decltype(rank(tma_gmem_basis_stride) == Int<1>{})::value) { + return E{}; // Return TMA Coord basis -- known scale of Int<1>{} + } else { + int32_t scale = ceil_div(int32_t(di * sizeof_bits_v / cute::max(gmem_prob_stride[j], uint64_t{16})), 8); + return E{} * scale; // Return TMA Coord basis -- with a dynamic scale factor + } + } + }); + +#if 0 + print("gmem_tma_basis_stride : "); print(gmem_tma_basis_stride); print("\n"); +#endif + + using AuxParams = AuxTmaParams; + return cute::make_tuple(tma_desc, AuxParams{gmem_tma_basis_stride}); +} + +template +CUTE_HOST_RTC +auto +make_tma_copy_atom(CopyOp, + Tensor const& gtensor, // Full GMEM Tensor + SLayout const& slayout, // CTA Tile of SMEM, potentially swizzled + uint32_t const& num_multicast, // The number of CTAs involved in multicasting + Layout const& cta_v_map) // V: CTA val idx -> gmem mode +{ + // + // TMA truncated layout + // + + auto smem_swizzle = get_swizzle_portion(slayout); + auto smem_layout = get_nonswizzle_portion(slayout); + + auto tma_gbasis = detail::construct_tma_gbasis(gtensor, smem_layout, cta_v_map); + + // + // Construct the TMA Desc and the strides of the TMA Tensor + // + + auto [tma_desc, aux_params] = 
detail::make_tma_copy_desc(gtensor, + tma_gbasis, + smem_swizzle, + num_multicast); + + // + // Construct the Copy_Traits + // + + constexpr int num_bits_per_tma = size(tma_gbasis) * sizeof_bits_v; + using Traits = Copy_Traits, decltype(aux_params)>; + using Atom = Copy_Atom; + + Traits tma_traits{tma_desc, aux_params}; + +#if 0 + print("num_bits_per_tma : "); print(num_bits_per_tma); print("\n"); + print("g_stride_bases : "); print(tma_traits.aux_params_.g_stride_); print("\n"); +#endif + + // Return the Copy_Atom + return Atom{tma_traits}; +} + +// The "logical TMA tid" is a map from the CTA rank to its logical id +// within the instruction. It works like a mask or ordering on the +// CTAs. For non-multicast TMA, all CTAs should map to 0. For +// multicast TMA of size 4, CTAs will be mapped to {0,1,2,3}. +template +CUTE_HOST_RTC +auto +make_tma_copy_tiled(CopyOp const& copy_op, + Tensor const& gtensor, // Full GMEM Tensor + SLayout const& slayout, // CTA Tile of SMEM + Layout const& cta_t_map, // T: CTA thr idx -> logical TMA tid + Layout const& cta_v_map) // V: CTA val idx -> gmem mode +{ + Copy_Atom atom = make_tma_copy_atom(copy_op, gtensor, slayout, + cosize(cta_t_map), cta_v_map); + + // + // Construct the TiledCopy + // + + [[maybe_unused]] auto cta_tiler = product_each(shape(cta_v_map)); + + auto num_elems_per_tma = size<1>(typename decltype(atom)::RefLayout{}) / static_value>(); + + // smem idx -> smem coord + auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout)); + // CTA V -> smem_coord + auto layout_v = composition(inv_smem_layout, num_elems_per_tma); + // Scale that up to cover all of the smem_coords + auto layout_V = tile_to_shape(make_layout(layout_v), size(cta_v_map)); + // CTA T -> smem idx + auto layout_t = make_layout(cosize(cta_t_map), shape_div(num_elems_per_tma, cosize(cta_t_map))); + // CTA TID -> smem coord + auto layout_T = composition(inv_smem_layout, composition(layout_t, cta_t_map)); + // Combine with the T mapping + [[maybe_unused]] auto layout_TV = make_layout(layout_T, layout_V); + +#if 0 + print("cta_tiler : "); print(cta_tiler); print("\n"); + print("layout_v : "); print(layout_v); print("\n"); + print("layout_V : "); print(layout_V); print("\n"); + print("layout_t : "); print(layout_t); print("\n"); + print("layout_T : "); print(layout_T); print("\n"); + print("layout_TV : "); print(layout_TV); print("\n"); +#endif + + return TiledCopy{atom}; +} + +} // end namespace detail + +/** Make a CuTe CTA-collective TiledCopy for a TMA operation. + * + * @param CopyOp The target copy operation: SM90_TMA_LOAD, SM90_TMA_LOAD_MULTICAST, SM90_TMA_STORE + * @param gtensor The GMEM Tensor to be involved in the TMA. + * @param slayout The SMEM Layout to be involved in the TMA. + * @param cta_tile The CTA-local tile that each CTA will be tiling GMEM with. + * This is often the blk_shape that is used to tile the GMEM for CTAs: + * local_tile(gtensor, blk_shape, blk_coord) -> CTA-local tile of gtensor + * @param cluster_size When using SM90_TMA_LOAD_MULTICAST, this can be a (static) power-of-2 <= 16 + * defining the multicast size (used to further partition the SMEM) + * Else, static-1 + * + * This code attempts to maximize the TMA box size. It does this by tracing + * the SMEM "vector" -- the inverse of the smem layout -- to find the largest + * contiguous array of smem that can be written to/from global memory given + * the constraints that the TMA instruction imposes. 
+ * + * This is accomplished by assigning "basis" strides to the GMEM to track which + * modes of SMEM map to which modes of GMEM, then reorder the modes of GMEM according + * to the SMEM vector, and then using those GMEM/SMEM modes to fill in the desc. + * + * Examples: + using T = float; + T* gptr = nullptr; + + { + // Simple 2D + Tensor gtensor = make_tensor(gptr, make_shape(1024, 256), GenRowMajor{}); // K-Major GMEM + auto slayout = make_layout(make_shape(_64{}, _32{}), GenRowMajor{}); // K-Major SMEM + auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout); + } + + { + // GMMA 2D + Tensor gtensor = make_tensor(gptr, make_shape(1024, 256)); // MN-Major GMEM + auto slayout = tile_to_shape(GMMA::Layout_MN_SW128_Atom{}, make_shape(_128{},_64{})); // MN-Major Swizzled+Tiled 128x64 SMEM + auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout); + } + + { + // 3D + Tensor gtensor = make_tensor(gptr, make_shape(1024, 32, 512), make_stride(64, Int<1>{}, 65536)); // GMEM + auto slayout = make_layout(make_shape(_16{}, _8{}, _2{}), make_stride(_16{}, _1{}, _8{})); // SMEM w/ same major-mode + auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout); + } + + { + // cuTENSOR 4D + auto layout = make_shape(make_shape(32,40),make_shape(make_shape(8,8),656)); // GMEM + auto cta_tile = make_shape(_128{},make_shape(_32{},_2{})); // GMEM Tiling: + // Take 128-elem from m: m0 must divide 128, + // m-last may be predicated + // Take 32-elem from k0, 2-elem from k1 + auto slayout = make_layout(cta_tile); // Col-Major SMEM + auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout, cta_tile, Int<1>{}); + } + * + * Check the TMA box size and desc: + print("TMA Box size: "); print(typename decltype(tma)::Tiler_MN{}); print("\n"); + print("TMA desc : "); print(tma.tma_desc_); print("\n"); + * + * Usage: + Tensor mA = tma_a.get_tma_tensor(make_shape(M,N)); // (M,N) TMA coord tensor + Tensor gA = local_tile(mA, cta_tile, cta_coord); // (BLK_M,BLK_N) TMA coord tensor for this CTA + Tensor sA = make_tensor(make_smem_ptr(sptr), slayout); // (BLK_M,BLK_N) SMEM tensor + + auto cta_tma = tma.get_slice(cta_idx_in_cluster); // Slice for multicast partitioning + Tensor tAgA = cta_tma.partition_S(gA); // Partition for src + Tensor tAsA = cta_tma.partition_D(sA); // Partition for dst + + copy(tma.with(barrier, mcast_mask), tAgA, tAsA); // copy with supporting TMA params + */ +template +CUTE_HOST_RTC +auto +make_tma_copy(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout, + CTA_Tiler const& cta_tiler, + Cluster_Size const& cluster_size) +{ + if constexpr (cute::is_same_v || + cute::is_same_v) { + return make_im2col_tma_copy(copy_op, + gtensor, + slayout, + cta_tiler, + cluster_size); + } else { + auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler); + auto cta_t_tile = make_layout(cluster_size); + // Prefer TmaInternalType if specified. 
Fallback to GEngine::value_type + using TmaType = conditional_t::value, typename GEngine::value_type, TmaInternalType>; + return detail::make_tma_copy_tiled(copy_op, + gtensor, slayout, + cta_t_tile, cta_v_tile); + } +} + +// Explicit defaulting +template +CUTE_HOST_RTC +auto +make_tma_copy(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout) +{ + return make_tma_copy(copy_op, gtensor, slayout, product_each(shape(slayout)), Int<1>{}); +} + +// Explicit defaulting +template +CUTE_HOST_RTC +auto +make_tma_copy(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout, + Cluster_Size const& cluster_size) +{ + return make_tma_copy(copy_op, gtensor, slayout, product_each(shape(slayout)), cluster_size); +} + +//////////////////////////////////// +// Experimental Make TMA Atom and Partitioner +/////////////////////////////////// + +template > +CUTE_HOST_RTC +auto +make_tma_atom(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout, + CTA_Tiler const& cta_tiler, + Cluster_Size const& cluster_size = {}) +{ + auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler); + // Prefer TmaInternalType if specified. Fallback to GEngine::value_type + using TmaType = conditional_t::value, typename GEngine::value_type, TmaInternalType>; + return detail::make_tma_copy_atom(copy_op, + gtensor, slayout, + size(cluster_size), cta_v_tile); +} + +// The "VectorCopy Partitioner" for TMA +template +CUTE_DEVICE +auto +tma_partition(Copy_Atom const& copy_atom, + CtaCoord const& cta_coord, + Layout const& cta_layout, // T: CTA coord -> logical multicast id + Tensor const& stensor, // SMEM Tensor (TMATile, Rest...) + Tensor const& gtensor) // GMEM Tensor (TMATile, Rest...) +{ + CUTE_STATIC_ASSERT_V(size<0>(stensor) == size<0>(gtensor)); + + // Invert the smem to get the largest contiguous vector in the smem layout + Layout inv_smem_layout = right_inverse(get_nonswizzle_portion(layout<0>(stensor))); + // Scale that up to cover all of the smem_coords + Layout layout_v = tile_to_shape(make_layout(inv_smem_layout), size<0>(stensor)); + + // Factor out the single-instrucion portion + Layout tma_layout_v = make_layout(Int::NumValSrc>{}); + auto layout_V = make_tile(logical_divide(layout_v, tma_layout_v)); + + // Append with _ until we cover all Rest... modes + auto glayout_V = append(layout_V, _); + auto slayout_V = append(layout_V, _); + // Transform tile mode and coalesce + Tensor gtensor_v = coalesce(gtensor.compose(glayout_V), Shape>{}); // ((TMA,TMA_Iter), Rest...) + Tensor stensor_v = coalesce(stensor.compose(slayout_V), Shape>{}); // ((TMA,TMA_Iter), Rest...) 
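+
+  // Note (added commentary): when multicasting, each participating CTA covers an
+  // equal 1/cosize(cta_layout) slice of the single-instruction TMA vector. The
+  // slice is selected below by offsetting the TMA mode by this CTA's logical
+  // multicast id, cta_layout(cta_coord).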
+ +#if 0 + if (thread0()) { + print("cta_coord : "); print(cta_coord); print("\n"); + print("cta_layout : "); print(cta_layout); print("\n"); + print("gtensor : "); print(gtensor); print("\n"); + print("stensor : "); print(stensor); print("\n"); + print("layout_V : "); print(layout_V); print("\n"); + print("gtensor_v : "); print(gtensor_v); print("\n"); + print("stensor_v : "); print(stensor_v); print("\n"); + } +#endif + + // Offset inside the TMA-mode for the multicast + auto multicast_offset = cta_layout(cta_coord) * (size(tma_layout_v) / cosize(cta_layout)); + auto multicast_coord = make_coord(make_coord(multicast_offset, Int<0>{})); + auto gcoord = append(multicast_coord, Int<0>{}); + auto scoord = append(multicast_coord, Int<0>{}); + + Tensor gresult = domain_offset(gcoord, gtensor_v); + Tensor sresult = domain_offset(scoord, stensor_v); + + return cute::make_tuple(gresult, sresult); +} + +// Explicit defaults for cta_coord and cta_layout +template +CUTE_DEVICE +auto +tma_partition(Copy_Atom const& copy_atom, + Tensor const& stensor, // SMEM Tensor (TMATile, Rest...) + Tensor const& gtensor) // GMEM Tensor (TMATile, Rest...) +{ + return tma_partition(copy_atom, Int<0>{}, Layout<_1,_0>{}, stensor, gtensor); +} + +// TMA Multicast Masks Calculation +template +CUTE_HOST_DEVICE constexpr +uint16_t +create_tma_multicast_mask(CtaLayout const& cta_layout_vmnk, + CtaCoord const& cta_coord_vmnk) +{ + auto cta_coord_slicer = replace(cta_coord_vmnk, _); + auto [cta_layout, elected_cta] = slice_and_offset(cta_coord_slicer, cta_layout_vmnk); + + uint16_t mcast_mask = 0; + if constexpr (rank_v == 1 and depth_v <= 1 and + not is_static::value) { + // Get the instruction code -- optimized for dynamic flat-rank-1 cta_layout + mcast_mask = uint16_t(1); + // Smear by stride<0> (may want to predicate on stride<0> mag?) + mcast_mask |= mcast_mask << (1*stride<0>(cta_layout)); + mcast_mask |= mcast_mask << (2*stride<0>(cta_layout)); + mcast_mask |= mcast_mask << (4*stride<0>(cta_layout)); + mcast_mask |= mcast_mask << (8*stride<0>(cta_layout)); + // Select shape<0> + mcast_mask &= (uint16_t(-1) >> (16 - shape<0>(cta_layout) * stride<0>(cta_layout))); + } else { + // Get the instruction code -- generic path + for (int i = 0; i < size(cta_layout); ++i) { + mcast_mask |= uint16_t(1) << cta_layout(i); + } + } + // Shift by the instruction's elected block rank (dynamic) + mcast_mask <<= elected_cta; + return mcast_mask; +} + +//////////////////////////////////// +// Make TMA copy A/B/C +/////////////////////////////////// + +template +CUTE_HOST_RTC +auto +make_tma_copy_A_sm90(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout, + CTA_Tiler const& cta_tiler, + Cluster_Size const& cluster_size) +{ + // Keep only MK modes from MNK + auto cta_tiler_mk = remove<1>(cta_tiler); + + // mcast along N mode for this M load, if any + auto cluster_size_n = size<1>(cluster_size); + + if constexpr (cute::is_same_v) { + return make_im2col_tma_copy(copy_op, + gtensor, + slayout, + cta_tiler_mk, + cluster_size_n); + } else { + auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler_mk); + auto cta_t_tile = make_layout(cluster_size_n); + + // Prefer TmaInternalType if specified. 
Fallback to GEngine::value_type + using TmaType = conditional_t::value, typename GEngine::value_type, TmaInternalType>; + auto tma_copy = detail::make_tma_copy_tiled(copy_op, gtensor, slayout, cta_t_tile, cta_v_tile); + return tma_copy; + } +} + +template +CUTE_HOST_RTC +auto +make_tma_copy_B_sm90(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout, + CTA_Tiler const& cta_tiler, + Cluster_Size const& cluster_size) +{ + // Keep only NK modes from MNK + auto cta_tiler_nk = remove<0>(cta_tiler); + + // mcast along M mode for this N load, if any + auto cluster_size_m = size<0>(cluster_size); + + if constexpr (cute::is_same_v) { + return make_im2col_tma_copy(copy_op, + gtensor, + slayout, + cta_tiler_nk, + cluster_size_m); + } else { + auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler_nk); + auto cta_t_tile = make_layout(cluster_size_m); + + // Prefer TmaInternalType if specified. Fallback to GEngine::value_type + using TmaType = conditional_t::value, typename GEngine::value_type, TmaInternalType>; + auto tma_copy = detail::make_tma_copy_tiled(copy_op, gtensor, slayout, cta_t_tile, cta_v_tile); + return tma_copy; + } +} + +template +CUTE_HOST_RTC +auto +make_tma_copy_C_sm90(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout, + CTA_Tiler const& cta_tiler) +{ + // Keep only MN modes from MNK + auto cta_tiler_mn = remove<2>(cta_tiler); + + if constexpr (cute::is_same_v || + cute::is_same_v) { + return make_im2col_tma_copy(copy_op, + gtensor, + slayout, + cta_tiler_mn, + _1{}); + } else { + auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler_mn); + + // No multicast, so only 1 CTA involved + auto cta_t_map = Layout<_1,_0>{}; + + // Prefer TmaInternalType if specified. Fallback to GEngine::value_type + using TmaType = conditional_t::value, typename GEngine::value_type, TmaInternalType>; + auto tma_copy = detail::make_tma_copy_tiled(copy_op, gtensor, slayout, cta_t_map, cta_v_tile); + return tma_copy; + } +} +} // end namespace cute diff --git a/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp b/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp new file mode 100644 index 0000000000..3286e72b36 --- /dev/null +++ b/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp @@ -0,0 +1,93 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +/// @file copy_traits_sm90_tma_swizzle.hpp +/// @brief Functions for converting swizzle layout to TMA descriptor + +#if !defined(__CUDACC_RTC__) +#include +#endif + +#include +#include + +namespace cute::detail { + +template +CUTE_HOST_DEVICE constexpr +TMA::SmemSwizzleBits +get_tma_swizzle_bits(Swizzle) +{ + if constexpr (M == 4) { + switch (B) { + default: static_assert(0 <= B && B <= 3, "Expected B = 0,1,2, or 3 when M == 4. Unsupported layout swizzle."); + case 3: return TMA::SmemSwizzleBits::B128; + case 2: return TMA::SmemSwizzleBits::B64; + case 1: return TMA::SmemSwizzleBits::B32; + case 0: return TMA::SmemSwizzleBits::DISABLE; + } + } else + { + static_assert(M < 0, "Unsupported layout swizzle."); + } +} + +template +TMA::SmemSwizzleBits +get_tma_swizzle_bits(Layout const& layout) +{ + return get_tma_swizzle_bits(get_swizzle_portion(layout)); +} + +template +CUTE_HOST_DEVICE constexpr +TMA::SmemSwizzleBase +get_tma_swizzle_base(Swizzle) +{ + if constexpr (M == 4) { + static_assert(0 <= B && B <= 3, "Expected B = 0,1,2, or 3 when M == 4. Unsupported layout swizzle."); + static_assert(S == 3, "Expected S = 3 when M == 4. Unsupported layout swizzle."); + return TMA::SmemSwizzleBase::SWIZZLE_BASE_16B; + } + else { + static_assert(M == 4, "Expected 128b=16B=(2^4)B base swizzle."); + } +} + +template +TMA::SmemSwizzleBase +get_tma_swizzle_base(Layout const& layout) +{ + return get_tma_swizzle_base(get_swizzle_portion(layout)); +} + +} // namespace cute::detail diff --git a/include/cute/atom/mma_atom.hpp b/include/cute/atom/mma_atom.hpp new file mode 100644 index 0000000000..7cb4fe3df2 --- /dev/null +++ b/include/cute/atom/mma_atom.hpp @@ -0,0 +1,1112 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include + +namespace cute { + +template +struct MMA_Atom; + +template +struct MMA_Atom : MMA_Atom> +{}; + +template +struct MMA_Atom> + : MMA_Traits +{ + using MMA_Op = MMAOperation; + using Traits = MMA_Traits; + + // Element value types from the MMA_Traits + using ValTypeD = typename Traits::ValTypeD; + using ValTypeA = typename Traits::ValTypeA; + using ValTypeB = typename Traits::ValTypeB; + using ValTypeC = typename Traits::ValTypeC; + + // Thr-Val layouts from the MMA_Traits + using Shape_MNK = typename Traits::Shape_MNK; + using ThrID = typename Traits::ThrID; + using LayoutC_TV = typename Traits::CLayout; + using LayoutA_TV = typename Traits::ALayout; + using LayoutB_TV = typename Traits::BLayout; + + // Fragment value types from the MMA_Traits (optional, defaults to Val type) + using FrgTypeD = typename detail::FrgTypeC_or_Default::type; + using FrgTypeA = typename detail::FrgTypeA_or_Default::type; + using FrgTypeB = typename detail::FrgTypeB_or_Default::type; + using FrgTypeC = typename detail::FrgTypeC_or_Default::type; + + // Additional Trait parameters/transformations + template + CUTE_HOST_DEVICE + auto + with(TraitsArgs&&... args) const { + auto traits = Traits::with(static_cast(args)...); + return MMA_Atom{traits}; + } + + // + // Tensor call interfaces + // + + // Cast, check, and call fma + template + CUTE_HOST_DEVICE constexpr + void + call(Tensor & D, + Tensor const& A, + Tensor const& B, + Tensor const& C) const + { + static_assert(DLayout::rank == 1, "Expected rank-1 D tensor"); + static_assert(ALayout::rank == 1, "Expected rank-1 A tensor"); + static_assert(BLayout::rank == 1, "Expected rank-1 B tensor"); + static_assert(CLayout::rank == 1, "Expected rank-1 C tensor"); + + return mma_unpack(static_cast(*this), D, A, B, C); + } + + // Three arguments reproduces C + template + CUTE_HOST_DEVICE constexpr + void + call(Tensor const& A, + Tensor const& B, + Tensor & C) const + { + return call(C, A, B, C); + } + + // + // make_fragment_A|B|C + // These functions are awkward as they expect already-partitioned tensors + // resulting from a previous call to partition_A|B|C + // The reasoning is that we can inspect the layout of the partitioned data + // and attempt to match it in generated fragment to promote vectorization + // when copying from partition to fragment. 
+ // + + template + CUTE_HOST_DEVICE static constexpr + auto + make_fragment_C(CTensor&& ctensor) + { + // Check that this tensor is likely already partitioned + CUTE_STATIC_ASSERT_V(rank(ctensor) >= Int<3>{}); // VMN + CUTE_STATIC_ASSERT_V(size<0>(ctensor) == size<1>(LayoutC_TV{})); + // C is a bit special because we are after accumulators here + // The input/output type doesn't have to match the accumulator type + //static_assert(std::is_same::value_type>::value, "Expecting ValTypeC type"); + + // We'll never base the accumulator layout on the input tensor layout, so just return a FrgTypeC tensor + return make_tensor(shape(ctensor)); + } + + template + CUTE_HOST_DEVICE static constexpr + auto + make_fragment_A(ATensor&& atensor) + { + // Check that this tensor is likely already partitioned + CUTE_STATIC_ASSERT_V(rank(atensor) >= Int<3>{}); // VMK + CUTE_STATIC_ASSERT_V(size<0>(atensor) == size<1>(LayoutA_TV{})); + + if constexpr (has_dereference::value) { + // If the intended FrgTypeA is a view (of the current tensor), forward the whole + static_assert(is_same::value_type>::value + , "Expecting ValTypeA type"); + return make_tensor(static_cast(atensor)); + } else { + // Else, the intended FrgTypeA is a value type, construct a new tensor with a fragment layout + return make_fragment_like(atensor); + } + + CUTE_GCC_UNREACHABLE; + } + + template + CUTE_HOST_DEVICE static constexpr + auto + make_fragment_B(BTensor&& btensor) + { + // Check that this tensor is likely already partitioned + CUTE_STATIC_ASSERT_V(rank(btensor) >= Int<3>{}); // VNK + CUTE_STATIC_ASSERT_V(size<0>(btensor) == size<1>(LayoutB_TV{})); + + if constexpr (has_dereference::value) { + // If the intended FrgTypeB is a view (of the current tensor), forward the whole + static_assert(is_same::value_type>::value + , "Expecting ValTypeB type"); + return make_tensor(static_cast(btensor)); + } else { + // Else, the intended FrgTypeB is a value type, construct a new tensor with a fragment layout + return make_fragment_like(btensor); + } + + CUTE_GCC_UNREACHABLE; + } +}; + +// +// A tiling of mma atoms +// + +template +struct ThrMMA; + +// @tparam MMA_Atom The MMA_Atom to use in the TiledMMA +// @tparam AtomLayoutMNK The MNK-tiling of the Atom to be performed. +// @tparam PermuationsMNK Permutations to apply to each MNK-mode before tiling for the Atom. +template > +struct TiledMMA : MMA_Atom +{ + using Atom = MMA_Atom; + using AtomShape_MNK = typename MMA_Atom::Shape_MNK; + using AtomThrID = typename MMA_Atom::ThrID; + using AtomLayoutC_TV = typename MMA_Atom::LayoutC_TV; + using AtomLayoutA_TV = typename MMA_Atom::LayoutA_TV; + using AtomLayoutB_TV = typename MMA_Atom::LayoutB_TV; + + static_assert( rank_v == 3, "TiledMMA requires rank-3 AtomLayoutMNK"); + static_assert( rank_v == 3, "TiledMMA requires rank-3 PermutationMNK"); + static_assert( is_tuple::value, "TiledMMA requires independent permutations of MNK."); + static_assert(is_static::value, "TiledMMA requires static permutations of MNK."); + + using ThrLayoutVMNK = decltype(tiled_product(AtomThrID{}, AtomLayoutMNK{})); + ThrLayoutVMNK thr_layout_vmnk_; + + CUTE_HOST_DEVICE constexpr + TiledMMA(MMA_Atom const& mma_atom = {}, AtomLayoutMNK const& thr_layout_mnk = {}) + : MMA_Atom(mma_atom), + thr_layout_vmnk_(tiled_product(AtomThrID{}, thr_layout_mnk)) {} + + CUTE_HOST_DEVICE constexpr auto + get_thr_layout_vmnk() const { + return thr_layout_vmnk_; + } + + // Tile a tensor or a layout from shape + // (M,N,...) 
+ // to shape + // ((ThrV,(ThrM,ThrN)),(FrgV,(RestM,RestN,...))) + // where + // ThrV: The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx + // ThrM: The threads tiled in M. layout<1>(ThrLayoutVMNK): ThrM -> thread_idx + // ThrN: The threads tiled in N. layout<2>(ThrLayoutVMNK): ThrN -> thread_idx + // FrgV: The values local to an MMA. + // RestM: The values tiled in M. + // RestN: The values tiled in N. + template + CUTE_HOST_DEVICE constexpr + auto + thrfrg_C(CTensor&& ctensor) const + { + CUTE_STATIC_ASSERT_V(rank(ctensor) >= Int<2>{}); + // Reorder the tensor for the TiledAtom + auto t_tile = make_tile(permutation_mnk<0>(), + permutation_mnk<1>()); + auto t_tensor = logical_divide(ctensor, t_tile); // (PermM,PermN) + + // Tile the tensor for the Atom + auto c_tile = make_tile(make_layout(size<0>(AtomShape_MNK{})), + make_layout(size<1>(AtomShape_MNK{}))); + auto c_tensor = zipped_divide(t_tensor, c_tile); // ((AtomM,AtomN),(RestM,RestN)) + + // Transform the Atom mode from (M,K) to (Thr,Val) + auto tv_tensor = c_tensor.compose(AtomLayoutC_TV{},_); // ((ThrV,FrgV),(RestM,RestN)) + + // Tile the tensor for the C-threads + auto thr_tile = make_tile(_, + make_tile(make_layout(size<1>(thr_layout_vmnk_)), + make_layout(size<2>(thr_layout_vmnk_)))); + auto thr_tensor = zipped_divide(tv_tensor, thr_tile); // ((ThrV,(ThrM,ThrN)),(FrgV,(RestM,RestN))) + + return thr_tensor; + } + + // Tile a tensor or a layout from shape + // (M,K,...) + // to shape + // ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK,...))) + // where + // ThrV: The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx + // ThrM: The threads tiled in M. layout<1>(ThrLayoutVMNK): ThrM -> thread_idx + // ThrK: The threads tiled in K. layout<3>(ThrLayoutVMNK): ThrK -> thread_idx + // FrgV: The values local to an MMA. + // RestM: The values tiled in M. + // RestK: The values tiled in K. + template + CUTE_HOST_DEVICE constexpr + auto + thrfrg_A(ATensor&& atensor) const + { + CUTE_STATIC_ASSERT_V(rank(atensor) >= Int<2>{}); + // Reorder the tensor for the TiledAtom + auto t_tile = make_tile(permutation_mnk<0>(), + permutation_mnk<2>()); + auto t_tensor = logical_divide(atensor, t_tile); // (PermM,PermK) + + // Tile the tensor for the Atom + auto a_tile = make_tile(make_layout(size<0>(AtomShape_MNK{})), + make_layout(size<2>(AtomShape_MNK{}))); + auto a_tensor = zipped_divide(t_tensor, a_tile); // ((AtomM,AtomK),(RestM,RestK)) + + // Transform the Atom mode from (M,K) to (Thr,Val) + auto tv_tensor = a_tensor.compose(AtomLayoutA_TV{},_); // ((ThrV,FrgV),(RestM,RestK)) + + // Tile the tensor for the Thread + auto thr_tile = make_tile(_, + make_tile(make_layout(size<1>(thr_layout_vmnk_)), + make_layout(size<3>(thr_layout_vmnk_)))); + auto thr_tensor = zipped_divide(tv_tensor, thr_tile); // ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK))) + + return thr_tensor; + } + + // Tile a tensor or a layout from shape + // (N,K,...) + // to shape + // ((ThrV,(ThrN,ThrK)),(FrgV,(RestN,RestK,...))) + // where + // ThrV: The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx + // ThrN: The threads tiled in N. layout<2>(ThrLayoutVMNK): ThrN -> thread_idx + // ThrK: The threads tiled in K. layout<3>(ThrLayoutVMNK): ThrK -> thread_idx + // FrgV: The values local to an MMA. + // RestN: The values tiled in N. + // RestK: The values tiled in K. 
+ template + CUTE_HOST_DEVICE constexpr + auto + thrfrg_B(BTensor&& btensor) const + { + CUTE_STATIC_ASSERT_V(rank(btensor) >= Int<2>{}); + // Reorder the tensor for the TiledAtom + auto t_tile = make_tile(permutation_mnk<1>(), + permutation_mnk<2>()); + auto t_tensor = logical_divide(btensor, t_tile); // (PermN,PermK) + + // Tile the tensor for the Atom + auto b_tile = make_tile(make_layout(size<1>(AtomShape_MNK{})), + make_layout(size<2>(AtomShape_MNK{}))); + auto b_tensor = zipped_divide(t_tensor, b_tile); // ((AtomN,AtomK),(RestN,RestK)) + + // Transform the Atom mode from (M,K) to (Thr,Val) + auto tv_tensor = b_tensor.compose(AtomLayoutB_TV{},_); // ((ThrV,FrgV),(RestN,RestK)) + + // Tile the tensor for the Thread + auto thr_tile = make_tile(_, + make_tile(make_layout(size<2>(thr_layout_vmnk_)), + make_layout(size<3>(thr_layout_vmnk_)))); + auto thr_tensor = zipped_divide(tv_tensor, thr_tile); // ((ThrV,(ThrN,ThrK)),(FrgV,(RestN,RestK))) + + return thr_tensor; + } + + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_slice(ThrIdx const& thr_idx) const + { + auto thr_vmnk = thr_layout_vmnk_.get_flat_coord(thr_idx); + return ThrMMA{*this, thr_vmnk}; + } + + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_thread_slice(ThrIdx const& thr_idx) const + { + return get_slice(thr_idx); + } + + // + // Utility for printing and visualization + // + + // The permutation applied to the MNK-mode data + template + CUTE_HOST_DEVICE constexpr + auto + permutation_mnk() const { + static_assert(0 <= I && I < 3); + auto perm = get(PermutationMNK{}); + return conditional_return(is_underscore{}, size(AtomShape_MNK{}) * size(get_thr_layout_vmnk()), perm); + } + + // The size of the MNK-mode + template + CUTE_HOST_DEVICE constexpr + auto + tile_size_mnk() const { + static_assert(0 <= I && I < 3); + return size(permutation_mnk()); + } + + CUTE_HOST_DEVICE constexpr + auto + get_layoutC_MN() const + { + // (M,N) -> (M,N) + auto ref_C = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<1>())); + // (cthrid,val) -> (M,N) + auto layoutC_TV = thrfrg_C(ref_C); + // (M,N) -> (cthrid,frg) + auto layoutC_MN = right_inverse(layoutC_TV).with_shape(shape(ref_C)); + + // cthrid = (v,m,n) -> thr_idx + auto thrID_C = thr_layout_vmnk_(_,_,_,Int<0>{}); + + return cute::make_tuple(layoutC_MN, thrID_C); + } + + CUTE_HOST_DEVICE constexpr + auto + get_layoutC_TV() const + { + // (M,N) -> (M,N) + auto ref_C = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<1>())); + // (cthrid,val) -> (M,N) + auto layoutC_TV = thrfrg_C(ref_C); + + // thr_idx -> (ThrV,ThrM,ThrN,ThrK) + auto thridx_2_thrid = right_inverse(thr_layout_vmnk_); + + // (thr_idx,val) -> (M,N) + return layoutC_TV.compose(thridx_2_thrid, _); + } + + CUTE_HOST_DEVICE constexpr + auto + get_layoutA_MK() const + { + // (M,K) -> (M,K) + auto ref_A = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<2>())); + // (athrid,val) -> (M,K) + auto layoutA_TV = thrfrg_A(ref_A); + // (M,K) -> (athrid,frg) + auto layoutA_MK = right_inverse(layoutA_TV).with_shape(shape(ref_A)); + + // athrid = (v,m,k) -> thr_idx + auto thrID_A = thr_layout_vmnk_(_,_,Int<0>{},_); + + return cute::make_tuple(layoutA_MK, thrID_A); + } + + CUTE_HOST_DEVICE constexpr + auto + get_layoutA_TV() const + { + // (M,K) -> (M,K) + auto ref_A = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<2>())); + // (athrid,val) -> (M,K) + auto layoutA_TV = thrfrg_A(ref_A); + + // (ThrV,(ThrM,ThrK)) -> (ThrV,(ThrM,ThrN,ThrK)) + auto atile = make_tile(_, + 
make_tile(make_layout(make_shape (size<1>(thr_layout_vmnk_), size<2>(thr_layout_vmnk_)), + make_stride( Int<1>{} , Int<0>{} )), + _)); + + // thr_idx -> (ThrV,ThrM,ThrN,ThrK) + auto thridx_2_thrid = right_inverse(thr_layout_vmnk_); + + // (thr_idx,val) -> (M,K) + return thrfrg_A(ref_A).compose(atile, _).compose(thridx_2_thrid, _); + } + + CUTE_HOST_DEVICE constexpr + auto + get_layoutB_NK() const + { + // (N,K) -> (N,K) + auto ref_B = make_layout(make_shape(tile_size_mnk<1>(), tile_size_mnk<2>())); + // (bthrid,val) -> (N,K) + auto layoutB_TV = thrfrg_B(ref_B); + // (N,K) -> (bthrid,frg) + auto layoutB_NK = right_inverse(layoutB_TV).with_shape(shape(ref_B)); + + // bthrid = (v,n,k) -> thr_idx + auto thrID_B = thr_layout_vmnk_(_,Int<0>{},_,_); + + return cute::make_tuple(layoutB_NK, thrID_B); + } + + CUTE_HOST_DEVICE constexpr + auto + get_layoutB_TV() const + { + // (N,K) -> (N,K) + auto ref_B = make_layout(make_shape(tile_size_mnk<1>(), tile_size_mnk<2>())); + // (bthrid,val) -> (N,K) + auto layoutB_TV = thrfrg_B(ref_B); + + // (ThrV,(ThrN,ThrK)) -> (ThrV,(ThrM,ThrN,ThrK)) + auto btile = make_tile(_, + make_tile(make_layout(make_shape (size<1>(thr_layout_vmnk_), size<2>(thr_layout_vmnk_)), + make_stride( Int<0>{} , Int<1>{} )), + _)); + + // thr_idx -> (ThrV,ThrM,ThrN,ThrK) + auto thridx_2_thrid = right_inverse(thr_layout_vmnk_); + + // (thr_idx,val) -> (N,K) + return thrfrg_B(ref_B).compose(btile, _).compose(thridx_2_thrid, _); + } +}; + +template +struct ThrMMA : TiledMMA +{ + ThrVMNK thr_vmnk_; + + template + CUTE_HOST_DEVICE constexpr + auto + partition_C(CTensor&& ctensor) const + { + auto thr_tensor = make_tensor(static_cast(ctensor).data(), this->thrfrg_C(ctensor.layout())); + + auto thr_vmn = make_coord(get<0>(thr_vmnk_), make_coord(get<1>(thr_vmnk_), get<2>(thr_vmnk_))); + return thr_tensor(thr_vmn, make_coord(_, repeat(thr_tensor)>(_))); + } + + template + CUTE_HOST_DEVICE constexpr + auto + partition_A(ATensor&& atensor) const + { + auto thr_tensor = make_tensor(static_cast(atensor).data(), this->thrfrg_A(atensor.layout())); + + auto thr_vmk = make_coord(get<0>(thr_vmnk_), make_coord(get<1>(thr_vmnk_), get<3>(thr_vmnk_))); + return thr_tensor(thr_vmk, make_coord(_, repeat(thr_tensor)>(_))); + } + + template + CUTE_HOST_DEVICE constexpr + auto + partition_B(BTensor&& btensor) const + { + auto thr_tensor = make_tensor(static_cast(btensor).data(), this->thrfrg_B(btensor.layout())); + + auto thr_vnk = make_coord(get<0>(thr_vmnk_), make_coord(get<2>(thr_vmnk_), get<3>(thr_vmnk_))); + return thr_tensor(thr_vnk, make_coord(_, repeat(thr_tensor)>(_))); + } + + template + CUTE_HOST_DEVICE constexpr + auto + partition_fragment_C(CTensor&& ctensor) const + { + return TiledMMA::make_fragment_C(partition_C(ctensor)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + partition_fragment_A(ATensor&& atensor) const + { + return TiledMMA::make_fragment_A(partition_A(atensor)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + partition_fragment_B(BTensor&& btensor) const + { + return TiledMMA::make_fragment_B(partition_B(btensor)); + } +}; + +// +// These tile the MMA_Atom as a whole +// + +template >, + class Permutations = Tile> +CUTE_HOST_DEVICE constexpr +auto +make_tiled_mma(MMA_Atom const& mma_atom, + MMAThrLayout const& thr_layout = {}, + Permutations const& permutations = {}) +{ + auto thr_layout_mnk = append<3>(thr_layout, Layout<_1,_0>{}); + auto permutation_mnk = append<3>(permutations, _); + + return TiledMMA, + decltype(thr_layout_mnk), + 
decltype(permutation_mnk)>{mma_atom, thr_layout_mnk}; +} + +template >, + class Permutations = Tile> +CUTE_HOST_DEVICE constexpr +auto +make_tiled_mma(MMA_Op const&, + MMAThrLayout const& thr_layout = {}, + Permutations const& permutations = {}) +{ + // Attempt to wrap in an MMA_Atom<> and forward + return make_tiled_mma(MMA_Atom{}, thr_layout, permutations); +} + +// +// partition_fragment_C -- static context +// + +template +CUTE_HOST_DEVICE constexpr +auto +partition_shape_C(TiledMMA const& mma, Shape_MN const& shape_MN) +{ + auto dummy = make_layout(shape(shape_MN)); + auto dummy_tv = mma.thrfrg_C(dummy); + // Slice+rearrange like partition_C + auto dummy_v = dummy_tv(Int<0>{}, make_coord(_, repeat(_))); + return shape(dummy_v); + +} + + +template +CUTE_HOST_DEVICE constexpr +auto +partition_fragment_C(TiledMMA const& mma, Shape_MN const& shapeMN) +{ + return make_tensor::FrgTypeC>(partition_shape_C(mma, shapeMN)); +} + +// partition_fragment_A and partition_fragment_B often depend on the +// layout of A and B and/or the thread_idx that is requesting the partition. +// For these reasons, they should not be used in a static context. +// See TiledMMA::get_slice(thr_idx).partition_fragment_A(tensorA) instead. + +template +CUTE_HOST_DEVICE constexpr +auto +partition_shape_A(TiledMMA const& mma, Shape_MK const& shape_MK) +{ + auto dummy = make_layout(shape(shape_MK)); + auto dummy_tv = mma.thrfrg_A(dummy); + // Slice+rearrange like partition_A + auto dummy_v = dummy_tv(Int<0>{}, make_coord(_, repeat(_))); + return shape(dummy_v); + +} + +template +CUTE_HOST_DEVICE constexpr +auto +partition_shape_B(TiledMMA const& mma, Shape_NK const& shape_NK) +{ + auto dummy = make_layout(shape(shape_NK)); + auto dummy_tv = mma.thrfrg_B(dummy); + // Slice+rearrange like partition_B + auto dummy_v = dummy_tv(Int<0>{}, make_coord(_, repeat(_))); + return shape(dummy_v); + +} + +// +// Size +// + +template +CUTE_HOST_DEVICE constexpr +auto +tile_size(TiledMMA const& mma) +{ + return mma.template tile_size_mnk(); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tile_shape(TiledMMA const& mma) +{ + return make_shape(tile_size<0>(mma), tile_size<1>(mma), tile_size<2>(mma)); +} + +// Deprecate? 
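+// Note: size(tiled_mma) returns the number of participating threads (identical to thr_size),
+// not an MNK extent; use tile_size<I> or tile_shape for the tile extents.
+// Illustrative sketch (the atom and thread layout are assumptions):
+//   auto mma = make_tiled_mma(SM80_16x8x16_F32F16F16F32_TN{}, Layout<Shape<_2,_2,_1>>{});
+//   size(mma);       // 128 threads
+//   tile_shape(mma); // (_32,_16,_16)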
+template +CUTE_HOST_DEVICE constexpr +auto +size(TiledMMA const& mma) +{ + return size(mma.get_thr_layout_vmnk()); +} + +// Alias +template +CUTE_HOST_DEVICE constexpr +auto +thr_size(TiledMMA const& mma) +{ + return size(mma.get_thr_layout_vmnk()); +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE +void +print(MMA_Atom> const&) +{ + using Atom = MMA_Atom>; + print("MMA_Atom\n"); + print(" ThrID: "); print(typename Atom::ThrID{}); print("\n"); + print(" Shape_MNK: "); print(typename Atom::Shape_MNK{}); print("\n"); + print(" LayoutA_TV: "); print(typename Atom::LayoutA_TV{}); print("\n"); + print(" LayoutB_TV: "); print(typename Atom::LayoutB_TV{}); print("\n"); + print(" LayoutC_TV: "); print(typename Atom::LayoutC_TV{}); print("\n"); +} + +template +CUTE_HOST_DEVICE +void +print(TiledMMA const& mma) +{ + print("TiledMMA\n"); + print(" ThrLayoutVMNK: "); print(mma.get_thr_layout_vmnk()); print("\n"); + print(" PermutationMNK: "); print(TiledPerm{}); print("\n"); + print(static_cast(mma)); +} + +template +CUTE_HOST_DEVICE +void +print(ThrMMA const& thr_mma) +{ + print("ThrMMA\n"); + print(" Thr VMNK: "); print(thr_mma.thr_vmnk_); print("\n"); + print(static_cast(thr_mma)); +} + +// MMA Atom to LaTeX TikZ +template +CUTE_HOST_DEVICE +void +print_latex(MMA_Atom const& mma_atom, + TikzColorFn color = {}) // lambda(thr_idx,val_idx) -> tikz color string +{ + print_latex(make_tiled_mma(mma_atom)); +} + +// TiledMMA to LaTeX TikZ +template +CUTE_HOST_DEVICE +void +print_latex(TiledMMA const& mma, + TikzColorFn color = {}) // lambda(thr_idx,val_idx) -> tikz color string +{ + auto layout_and_thrid_C = mma.get_layoutC_MN(); + auto layoutC_MN = get<0>(layout_and_thrid_C); + auto thrID_C = get<1>(layout_and_thrid_C); + + auto layout_and_thrid_A = mma.get_layoutA_MK(); + auto layoutA_MK = get<0>(layout_and_thrid_A); + auto thrID_A = get<1>(layout_and_thrid_A); + + auto layout_and_thrid_B = mma.get_layoutB_NK(); + auto layoutB_NK = get<0>(layout_and_thrid_B); + auto thrID_B = get<1>(layout_and_thrid_B); + + print_latex_mma(layoutC_MN, thrID_C, + layoutA_MK, thrID_A, + layoutB_NK, thrID_B); +} + +// MNK MMA Layout to LaTeX TikZ +template +CUTE_HOST_DEVICE +void +print_latex_mma(LayoutC const& C, ThrIDC const& TC, // (m,n) -> (tid,vid) and tid -> thr_idx + LayoutA const& A, ThrIDA const& TA, // (m,k) -> (tid,vid) and tid -> thr_idx + LayoutB const& B, ThrIDB const& TB, // (n,k) -> (tid,vid) and tid -> thr_idx + TikzColorFn color = {}) // lambda(thr_idx,val_idx) -> tikz color string +{ + CUTE_STATIC_ASSERT_V(rank(C) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(A) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(B) == Int<2>{}); + + assert(size<0>(A) == size<0>(C)); + assert(size<0>(B) == size<1>(C)); + assert(size<1>(A) == size<1>(B)); + + // Commented prints + printf("%% LayoutC: "); print(C); printf("\n"); + printf("%% ThrIDC : "); print(TC); printf("\n"); + printf("%% LayoutA: "); print(A); printf("\n"); + printf("%% ThrIDA : "); print(TA); printf("\n"); + printf("%% LayoutB: "); print(B); printf("\n"); + printf("%% ThrIDB : "); print(TB); printf("\n\n"); + // Header + printf("\\documentclass[convert]{standalone}\n" + "\\usepackage{tikz}\n\n" + "\\begin{document}\n" + "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n"); + + // C starting at 0,0 + for (int m = 0; m < size<0>(C); ++m) { + for (int n = 0; n < size<1>(C); ++n) { + int thrid = C(m,n) % size(TC); + int val_idx = C(m,n) / size(TC); + int thr_idx = TC(thrid); + + 
printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color(thr_idx, val_idx), + m, n, + thr_idx, val_idx); + } + } + // Grid + printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n", + 0, 0, int(size<0>(C)), int(size<1>(C))); + + // A starting at 0,-size<1>(A)-1 + for (int m = 0; m < size<0>(A); ++m) { + for (int k = 0; k < size<1>(A); ++k) { + int thrid = A(m,k) % size(TA); + int val_idx = A(m,k) / size(TA); + int thr_idx = TA(thrid); + + printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color(thr_idx, val_idx), + m, k-1-size<1>(A), + thr_idx, val_idx); + } + } + // Grid + printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n", + 0, int(-size<1>(A)-1), int(size<0>(A)), -1); + // A labels + for (int m = 0, k = -1; m < size<0>(A); ++m) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", m, k-1-size<1>(A), m); + } + for (int m = -1, k = 0; k < size<1>(A); ++k) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", m, k-1-size<1>(A), k); + } + + // B starting at -size<1>(B)-1,0 + for (int n = 0; n < size<0>(B); ++n) { + for (int k = 0; k < size<1>(B); ++k) { + int thrid = B(n,k) % size(TB); + int val_idx = B(n,k) / size(TB); + int thr_idx = TB(thrid); + + printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color(thr_idx, val_idx), + k-1-size<1>(B), n, + thr_idx, val_idx); + } + } + // Grid + printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n", + int(-size<1>(B)-1), 0, -1, int(size<0>(B))); + // B labels + for (int n = 0, k = -1; n < size<0>(B); ++n) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", k-1-size<1>(B), n, n); + } + for (int n = -1, k = 0; k < size<1>(B); ++k) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", k-1-size<1>(B), n, k); + } + + // Footer + printf("\\end{tikzpicture}\n" + "\\end{document}\n"); +} + +// MNK MMA Layout to console printer +template +CUTE_HOST_DEVICE +void +print_layout_mma(LayoutC const& C, ThrIDC const& TC, // (m,n) -> (tid,vid) and tid -> thr_idx + LayoutA const& A, ThrIDA const& TA, // (m,k) -> (tid,vid) and tid -> thr_idx + LayoutB const& B, ThrIDB const& TB) // (n,k) -> (tid,vid) and tid -> thr_idx +{ + CUTE_STATIC_ASSERT_V(rank(C) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(A) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(B) == Int<2>{}); + + assert(size<0>(A) == size<0>(C)); + assert(size<0>(B) == size<1>(C)); + assert(size<1>(A) == size<1>(B)); + + int a_width = size<1>(A) * 6 + 4; + + // Print out B (white-shifted) k-by-n + for (int k = 0; k < size<1>(B); ++k) { + // Header + printf("%*s", a_width, ""); + for (int n = 0; n < size<0>(B); ++n) printf("+-----"); + printf("+\n"); + // Values + printf("%*s", a_width, ""); + for (int n = 0; n < size<0>(B); ++n) printf("|T%02dV%1d", int(TB(B(n,k) % size(TB))), int(B(n,k) / size(TB))); + printf("|\n"); + } + // Footer + printf("%*s", a_width, ""); + for (int n = 0; n < size<0>(B); ++n) printf("+-----"); + printf("+\n\n"); + + // Print out A m-by-k and C m-by-n + for (int m = 0; m < size<0>(A); ++m) { + // Header + for (int k = 0; k < size<1>(A); ++k) printf("+-----"); + printf("+ "); + for (int n = 0; n < size<1>(C); ++n) printf("+-----"); + printf("+\n"); + // Values + for (int k = 0; k < size<1>(A); ++k) printf("|T%02dV%1d", int(TA(A(m,k) % size(TA))), int(A(m,k) / size(TA))); + printf("| "); + for (int n = 0; n < size<1>(C); ++n) printf("|T%02dV%1d", int(TC(C(m,n) % size(TC))), int(C(m,n) / size(TC))); + printf("|\n"); + } + // Footer + for 
(int k = 0; k < size<1>(A); ++k) printf("+-----"); + printf("+ "); + for (int n = 0; n < size<1>(C); ++n) printf("+-----"); + printf("+\n"); +} + +// MNK MMA Layout to SVG -- 8-value color coded by thread +template +CUTE_HOST_DEVICE +void +print_svg_mma(LayoutC const& C, ThrIDC const& TC, // (m,n) -> (tid,vid) and tid -> thr_idx + LayoutA const& A, ThrIDA const& TA, // (m,k) -> (tid,vid) and tid -> thr_idx + LayoutB const& B, ThrIDB const& TB) // (n,k) -> (tid,vid) and tid -> thr_idx +{ + char const *color_map[8] = {"175,175,255", "175,255,175", "255,255,175", + "255,175,175", "210,210,255", "210,255,210", + "255,255,210", "255,210,210"}; + + const int cell_width = 20; + const int cell_height = 20; + + const int page_width = (size<1>(A) + size<0>(B) + 2) * cell_width; + const int page_height = (size<1>(B) + size<0>(A) + 2) * cell_height; + + // header + printf("\n", + page_width, page_height); + + // C + int c_base_x = (size<1>(A) + 2) * cell_width; + int c_base_y = (size<1>(B) + 2) * cell_height; + for (int m = 0; m < cute::size<0>(C); ++m) { + for (int n = 0; n < cute::size<1>(C); ++n) { + + int thrid = C(m, n) % size(TC); + int val_idx = C(m, n) / size(TC); + int thr_idx = TC(thrid); + + int x = n * cell_width + c_base_x; + int y = m * cell_height + c_base_y; + + int thr_x = x + cell_width / 2; + int thr_y = y + cell_height / 4; + int val_x = x + cell_width / 2; + int val_y = y + cell_height * 3 / 4; + + printf("\n", + x, y, cell_width, cell_height, color_map[thr_idx % 8]); + + printf("T%d\n", + thr_x, thr_y, thr_idx); + printf("V%d\n", + val_x, val_y, val_idx); + } + } + + // A + int a_base_x = cell_width; + int a_base_y = (size<1>(B) + 2) * cell_height; + for (int m = 0; m < size<0>(A); ++m) { + for (int k = 0; k < size<1>(A); ++k) { + int thrid = A(m, k) % size(TA); + int val_idx = A(m, k) / size(TA); + int thr_idx = TA(thrid); + + int x = k * cell_width + a_base_x; + int y = m * cell_height + a_base_y; + + int thr_x = x + cell_width / 2; + int thr_y = y + cell_height / 4; + int val_x = x + cell_width / 2; + int val_y = y + cell_height * 3 / 4; + + printf("\n", + x, y, cell_width, cell_height, color_map[thr_idx % 8]); + printf("T%d\n", + thr_x, thr_y, thr_idx); + printf("V%d\n", + val_x, val_y, val_idx); + } + } + + // B + int b_base_x = (size<1>(A) + 2) * cell_width; + int b_base_y = cell_height; + for (int n = 0; n < size<0>(B); ++n) { + for (int k = 0; k < size<1>(B); ++k) { + int thrid = B(n, k) % size(TB); + int val_idx = B(n, k) / size(TB); + int thr_idx = TB(thrid); + + int x = n * cell_width + b_base_x; + int y = k * cell_height + b_base_y; + + int thr_x = x + cell_width / 2; + int thr_y = y + cell_height / 4; + int val_x = x + cell_width / 2; + int val_y = y + cell_height * 3 / 4; + + printf("\n", + x, y, cell_width, cell_height, color_map[thr_idx % 8]); + printf("T%d\n", + thr_x, thr_y, thr_idx); + printf("V%d\n", + val_x, val_y, val_idx); + } + } + + // A labels + for (int m = 0; m < size<0>(A); ++m) { + int x = cell_width / 2; + int y = m * cell_height + cell_height / 2 + a_base_y; + printf("%d\n", + x, y, m); + } + for (int k = 0; k < size<1>(A); ++k) { + int x = cell_width + k * cell_width + cell_width / 2; + int y = -cell_height / 2 + a_base_y; + printf("%d\n", + x, y, k); + } + + // B labels + for (int n = 0; n < size<0>(B); ++n) { + int x = b_base_x + cell_width * n + cell_width / 2; + int y = cell_height / 2; + printf("%d\n", + x, y, n); + } + for (int k = 0; k < size<1>(B); ++k) { + int x = b_base_x - cell_width / 2; + int y = cell_height * (k + 1) + cell_height / 
2; + printf("%d\n", + x, y, k); + } + + // footer + printf(""); +} + +template +CUTE_HOST_DEVICE +void +print_svg(MMA_Atom const &mma_atom) { + print_svg(make_tiled_mma(mma_atom)); +} + +template +CUTE_HOST_DEVICE +void +print_svg(TiledMMA const &mma) { + auto layout_and_thrid_C = mma.get_layoutC_MN(); + auto layoutC_MN = get<0>(layout_and_thrid_C); + auto thrID_C = get<1>(layout_and_thrid_C); + + auto layout_and_thrid_A = mma.get_layoutA_MK(); + auto layoutA_MK = get<0>(layout_and_thrid_A); + auto thrID_A = get<1>(layout_and_thrid_A); + + auto layout_and_thrid_B = mma.get_layoutB_NK(); + auto layoutB_NK = get<0>(layout_and_thrid_B); + auto thrID_B = get<1>(layout_and_thrid_B); + + print_svg_mma(layoutC_MN, thrID_C, layoutA_MK, thrID_A, layoutB_NK, thrID_B); +} + +} // namespace cute + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cute/atom/mma_traits.hpp b/include/cute/atom/mma_traits.hpp new file mode 100644 index 0000000000..0994698a87 --- /dev/null +++ b/include/cute/atom/mma_traits.hpp @@ -0,0 +1,189 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // cute::Tensor +#include // cute::is_rmem +#include // cute::UniversalFMA +#include // cute::detail::explode + +namespace cute +{ + +/** + * concept MMA_Traits + * { + * using ValTypeD = // Logical A-value type + * using ValTypeA = // Logical B-value type + * using ValTypeB = // Logical C-value type + * using ValTypeC = // Logical D-value type (NOTE: Not used? 
Assumed == ValTypeD) + * + * using FrgTypeA = // A-type consumed by MMA (if ommitted, same as ValTypeA) + * using FrgTypeB = // B_type consumed by MMA (if ommitted, same as ValTypeB) + * using FrgTypeC = // C_type consumed by MMA (if ommitted, same as ValTypeC) + * + * using Shape_MNK = // Logical MxNxK shape of the MMA + * + * using ThrID = // Logical thread id (tid) -> tidx + * + * using ALayout = // (Logical thread id (tid), Logical value id (vid)) -> Flat MK-coord + * using BLayout = // (Logical thread id (tid), Logical value id (vid)) -> Flat NK-coord + * using CLayout = // (Logical thread id (tid), Logical value id (vid)) -> Flat MN-coord + * }; + */ + +template +struct MMA_Traits +{ + static_assert(sizeof(MMAOperation) == 0, "MMA_Traits not implemented for this MMA_Operation."); +}; + +template +struct MMA_Traits> +{ + using ValTypeD = D; + using ValTypeA = A; + using ValTypeB = B; + using ValTypeC = C; + + // Logical shape of the MMA + using Shape_MNK = Shape<_1,_1,_1>; + + // Logical thread id (tid) -> tidx + using ThrID = Layout<_1>; + + // (Logical thread id (tid), Logical value id (vid)) -> coord + + // (tid,vid) -> (m,k) + using ALayout = Layout>; + // (tid,vid) -> (n,k) + using BLayout = Layout>; + // (tid,vid) -> (m,n) + using CLayout = Layout>; +}; + +// Extract an MMA_Op from an MMA_Traits +template +struct MMA_Op {}; + +template +struct MMA_Op> { + using type = MMA_Op_Arg; +}; + +// +// Generic mma_unpack for any MMA_Traits +// + +template +CUTE_HOST_DEVICE constexpr +void +mma_unpack(AnyMMATraits const& traits, + Tensor & D, + Tensor const& A, + Tensor const& B, + Tensor const& C) +{ + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + + // Register value types from the MMA_Operation register arrays + using MMA_Op = typename MMA_Op::type; + using RegTypeD = typename remove_extent::type; + using RegTypeA = typename remove_extent::type; + using RegTypeB = typename remove_extent::type; + using RegTypeC = typename remove_extent::type; + + Tensor rA = recast(A); + Tensor rB = recast(B); + Tensor rD = recast(D); + Tensor rC = recast(C); + + constexpr int RegNumD = extent::value; + constexpr int RegNumA = extent::value; + constexpr int RegNumB = extent::value; + constexpr int RegNumC = extent::value; + + CUTE_STATIC_ASSERT_V(size(rA) == Int{}); + CUTE_STATIC_ASSERT_V(size(rB) == Int{}); + CUTE_STATIC_ASSERT_V(size(rD) == Int{}); + CUTE_STATIC_ASSERT_V(size(rC) == Int{}); + + detail::explode(MMA_Op::fma, + rD, make_int_sequence{}, + rA, make_int_sequence{}, + rB, make_int_sequence{}, + rC, make_int_sequence{}); +} + +// Accept mutable temporaries +template +CUTE_HOST_DEVICE constexpr +void +mma_unpack(AnyMMATraits const& traits, + Tensor && D, + Tensor const& A, + Tensor const& B, + Tensor const& C) +{ + mma_unpack(traits, D, A, B, C); +} + +namespace detail { + +template +struct FrgTypeA_or_Default { using type = typename X::ValTypeA; }; +template +struct FrgTypeA_or_Default> { using type = typename X::FrgTypeA; }; + +template +struct FrgTypeB_or_Default { using type = typename X::ValTypeB; }; +template +struct FrgTypeB_or_Default> { using type = typename X::FrgTypeB; }; + +template +struct FrgTypeC_or_Default { using type = typename X::ValTypeC; }; +template +struct FrgTypeC_or_Default> { using type = typename X::FrgTypeC; }; + +} // end 
namespace detail + +} // namespace cute diff --git a/include/cute/atom/mma_traits_sm61.hpp b/include/cute/atom/mma_traits_sm61.hpp new file mode 100644 index 0000000000..f72a639400 --- /dev/null +++ b/include/cute/atom/mma_traits_sm61.hpp @@ -0,0 +1,73 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_1,_1,_4>; + using ThrID = Layout<_1>; + using ALayout = Layout>; + using BLayout = Layout>; + using CLayout = Layout>; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int16_t; + using ValTypeB = int16_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_1,_1,_2>; + using ThrID = Layout<_1>; + using ALayout = Layout>; + using BLayout = Layout>; + using CLayout = Layout>; +}; + +} // namespace cute diff --git a/include/cute/atom/mma_traits_sm70.hpp b/include/cute/atom/mma_traits_sm70.hpp new file mode 100644 index 0000000000..f0702a9617 --- /dev/null +++ b/include/cute/atom/mma_traits_sm70.hpp @@ -0,0 +1,198 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +namespace { + +// Logical thread id to thread idx (quadpair) +using SM70_QuadPair = Layout, + Stride<_1,_16>>; +// (T8,V4) -> (M8,K4) +using SM70_8x4_Row = Layout, + Stride<_1,_8>>; +// (T8,V4) -> (M8,K4) +using SM70_8x4_Col = Layout,_4>, + Stride,_1>>; +// (T8,V8) -> (M8,N8) +using SM70_8x8_16b = Layout, + Stride<_1,_8>>; +// (T8,V8) -> (M8,N8) +using SM70_8x8_32b = Layout,Shape <_2,_2, _2>>, + Stride,Stride<_8,_2,_32>>>; + +} + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Row; + using BLayout = SM70_8x4_Row; + using CLayout = SM70_8x8_16b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Col; + using BLayout = SM70_8x4_Col; + using CLayout = SM70_8x8_16b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Col; + using BLayout = SM70_8x4_Row; + using CLayout = SM70_8x8_16b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Row; + using BLayout = SM70_8x4_Col; + using CLayout = 
SM70_8x8_16b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Row; + using BLayout = SM70_8x4_Row; + using CLayout = SM70_8x8_32b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Col; + using BLayout = SM70_8x4_Col; + using CLayout = SM70_8x8_32b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Col; + using BLayout = SM70_8x4_Row; + using CLayout = SM70_8x8_32b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Row; + using BLayout = SM70_8x4_Col; + using CLayout = SM70_8x8_32b; +}; + +/////////////////////////////////////////////////////////////////////////////// +} // namespace cute diff --git a/include/cute/atom/mma_traits_sm75.hpp b/include/cute/atom/mma_traits_sm75.hpp new file mode 100644 index 0000000000..1d3f51961c --- /dev/null +++ b/include/cute/atom/mma_traits_sm75.hpp @@ -0,0 +1,81 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +template <> +struct MMA_Traits +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using Shape_MNK = Shape<_16,_8,_8>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_8>>>; + using BLayout = Layout,_2>, + Stride,_8>>; + using CLayout = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_8>>>; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_8,_8,_16>; + using ThrID = Layout<_32>; + using ALayout = Layout,_4>, + Stride,_8>>; + using BLayout = Layout,_4>, + Stride,_8>>; + using CLayout = Layout,_2>, + Stride,_8>>; +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cute diff --git a/include/cute/atom/mma_traits_sm80.hpp b/include/cute/atom/mma_traits_sm80.hpp new file mode 100644 index 0000000000..5f7e73e467 --- /dev/null +++ b/include/cute/atom/mma_traits_sm80.hpp @@ -0,0 +1,690 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace cute +{ + +namespace { + +// (T32,V1) -> (M8,N8) +using SM80_8x4 = Layout,_1>, + Stride,_0>>; +// (T32,V2) -> (M8,N8) +using SM80_8x8_Row = Layout,_2>, + Stride,_8>>; +// (T32,V4) -> (M8,N16) +using SM80_8x16_Row = Layout,_4>, + Stride,_8>>; +// (T32,V4) -> (M16,N8) +using SM80_16x8_Row = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_8>>>; + +} + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp16 = fp16 * fp16 + fp16 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_16,_8,_8>; + using ThrID = Layout<_32>; + using ALayout = SM80_16x8_Row; + using BLayout = SM80_8x8_Row; + using CLayout = SM80_16x8_Row; +}; + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_16,_8,_16>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape < _2,_2, _2>>, + Stride,Stride<_16,_8,_128>>>; + using BLayout = Layout,Shape <_2, _2>>, + Stride,Stride<_8,_64>>>; + using CLayout = SM80_16x8_Row; +}; + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp32 = fp16 * fp16 + fp32 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; +}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; +}; + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp32 = bf16 * bf16 + fp32 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; +}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; +}; + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp32 = tf32 * tf32 + fp32 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = float; + using ValTypeA = cutlass::tfloat32_t; + using ValTypeB = 
cutlass::tfloat32_t; + using ValTypeC = float; + + using Shape_MNK = Shape<_16,_8,_4>; + using ThrID = Layout<_32>; + using ALayout = Layout,_2>, + Stride,_8>>; + using BLayout = SM80_8x4; + using CLayout = SM80_16x8_Row; +}; + +template <> +struct MMA_Traits +{ + using ValTypeD = float; + using ValTypeA = cutlass::tfloat32_t; + using ValTypeB = cutlass::tfloat32_t; + using ValTypeC = float; + + using Shape_MNK = Shape<_16,_8,_8>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape <_2, _2>>, + Stride,Stride<_8,_64>>>; + using BLayout = Layout, _2>, + Stride,_32>>; + using CLayout = SM80_16x8_Row; +}; + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp64 = fp64 * fp64 + fp64 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = double; + using ValTypeA = double; + using ValTypeB = double; + using ValTypeC = double; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = Layout<_32>; + using ALayout = SM80_8x4; + using BLayout = SM80_8x4; + using CLayout = SM80_8x8_Row; +}; + +// Custom complex fp64 MMA composed of 4 fp64 MMAs -- same layouts +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = complex; + using ValTypeA = complex; + using ValTypeB = complex; + using ValTypeC = complex; +}; + +// Custom complex fp64 MMA composed of 3 fp64 MMAs -- same layouts +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = typename SM80_8x8x4_GC64C64C64GC64_TN::GaussComplex; + using ValTypeA = complex; + using ValTypeB = complex; + using ValTypeC = typename SM80_8x8x4_GC64C64C64GC64_TN::GaussComplex; +}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = s8 * s8 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_8,_8,_16>; + using ThrID = Layout<_32>; + using ALayout = SM80_8x16_Row; + using BLayout = SM80_8x16_Row; + using CLayout = SM80_8x8_Row; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_16,_8,_16>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape < _4,_2>>, + Stride,Stride<_16,_8>>>; + using BLayout = SM80_8x16_Row; + using CLayout = SM80_16x8_Row; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_16,_8,_32>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape < _4,_2, _2>>, + Stride,Stride<_16,_8,_256>>>; + using BLayout = Layout, Shape <_4, _2>>, + Stride, Stride<_8,_128>>>; + using CLayout = SM80_16x8_Row; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = s8 * u8 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = 
int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = u8 * s8 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = u8 * u8 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = s4 * s4 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = int4b_t; + using ValTypeB = int4b_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_8, _8, _32>; + using ThrID = Layout<_32>; + // (T32,V8) -> (M8,N32) + using ALayout = Layout, Shape <_8>>, + Stride, Stride<_8>>>; + using BLayout = Layout, Shape <_8>>, + Stride, Stride<_8>>>; + using CLayout = SM80_8x8_Row; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = int4b_t; + using ValTypeB = int4b_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_16, _8, _32>; + using ThrID = Layout<_32>; + // (T32,V16) -> (M16,N32) + using ALayout = Layout, Shape < _8, _2>>, + Stride, Stride<_16, _8>>>; + // (T32,V8) -> (M8,N32) + using BLayout = Layout, Shape <_8>>, + Stride, Stride<_8>>>; + using CLayout = SM80_16x8_Row; +}; + +template <> +struct 
MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = int4b_t; + using ValTypeB = int4b_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_16, _8, _64>; + using ThrID = Layout<_32>; + // (T32,V32) -> (M16,N64) + using ALayout = Layout, Shape < _8, _2, _2>>, + Stride, Stride<_16, _8, _512>>>; + // (T32,V16) -> (M8,N64) + using BLayout = Layout, Shape <_8, _2>>, + Stride, Stride<_8, _256>>>; + using CLayout = SM80_16x8_Row; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = s4 * u4 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = int4b_t; + using ValTypeB = uint4b_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = int4b_t; + using ValTypeB = uint4b_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; +template <> +struct MMA_Traits + : MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = int4b_t; + using ValTypeB = uint4b_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = u4 * s4 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = uint4b_t; + using ValTypeB = int4b_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = uint4b_t; + using ValTypeB = int4b_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = uint4b_t; + using ValTypeB = int4b_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = u4 * u4 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = uint4b_t; + using ValTypeB = uint4b_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = uint4b_t; + using ValTypeB = uint4b_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits { + using ValTypeD = int32_t; + using ValTypeA = uint4b_t; + using ValTypeB = uint4b_t; + using ValTypeC = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = b1 ^ b1 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits 
+{ + using ValTypeD = int32_t; + using ValTypeA = cute::uint1b_t; + using ValTypeB = cute::uint1b_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_16,_8,_256>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape<_32,_2,_2>>, + Stride,Stride<_16,_8,_2048>>>; + using BLayout = Layout,Shape<_32,_2>>, + Stride,Stride< _8,_1024>>>; + using CLayout = SM80_16x8_Row; +}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = b1 & b1 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template<> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = cute::uint1b_t; + using ValTypeB = cute::uint1b_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_8,_8,_128>; + using ThrID = Layout<_32>; + using ALayout = Layout,_32>, + Stride,_8>>; + using BLayout = Layout,_32>, + Stride,_8>>; + using CLayout = SM80_8x8_Row; +}; + +template <> +struct MMA_Traits + :MMA_Traits {}; + +template<> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = cute::uint1b_t; + using ValTypeB = cute::uint1b_t; + using ValTypeC = int32_t; + + using Shape_MNK = Shape<_16,_8,_128>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape<_32,_2>>, + Stride,Stride>>>; + using BLayout = Layout,_32>, + Stride,_8>>; + using CLayout = SM80_16x8_Row; +}; + +template <> +struct MMA_Traits + :MMA_Traits {}; + +} // end namespace cute diff --git a/include/cute/atom/mma_traits_sm90.hpp b/include/cute/atom/mma_traits_sm90.hpp new file mode 100644 index 0000000000..b2ced3f878 --- /dev/null +++ b/include/cute/atom/mma_traits_sm90.hpp @@ -0,0 +1,144 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include + +#include + +namespace cute { + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp64 = fp64 * fp64 + fp64 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +using SM90_16x8x4_F64F64F64F64_TN = SM90::MMA_16x8x4_F64F64F64F64_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = double; + using ValTypeA = double; + using ValTypeB = double; + using ValTypeC = double; + + using Shape_MNK = Shape<_16,_8,_4>; + using ThrID = Layout<_32>; + using ALayout = Layout,_2>, + Stride,_8>>; + using BLayout = Layout,_1>, + Stride,_0>>; + using CLayout = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_8>>>; +}; + +using SM90_16x8x8_F64F64F64F64_TN = SM90::MMA_16x8x8_F64F64F64F64_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = double; + using ValTypeA = double; + using ValTypeB = double; + using ValTypeC = double; + + using Shape_MNK = Shape<_16,_8,_8>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape <_2, _2>>, + Stride,Stride<_8,_64>>>; + using BLayout = Layout, _2>, + Stride,_32>>; + using CLayout = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_8>>>; +}; + +using SM90_16x8x16_F64F64F64F64_TN = SM90::MMA_16x8x16_F64F64F64F64_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = double; + using ValTypeA = double; + using ValTypeB = double; + using ValTypeC = double; + + using Shape_MNK = Shape<_16,_8,_16>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape <_2, _4>>, + Stride,Stride<_8,_64>>>; + using BLayout = Layout, _4>, + Stride,_32>>; + using CLayout = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_8>>>; +}; + +/////////////////////////////////////////////////////////////////////////////////// +//////////////////////// cfp64 = cfp64 * cfp64 + cfp64 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////////// + +using SM90_16x8x4_C64C64C64C64_TN = SM90::MMA_16x8x4_C64C64C64C64_TN; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = complex; + using ValTypeA = complex; + using ValTypeB = complex; + using ValTypeC = complex; +}; + +using SM90_16x8x8_C64C64C64C64_TN = SM90::MMA_16x8x8_C64C64C64C64_TN; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = complex; + using ValTypeA = complex; + using ValTypeB = complex; + using ValTypeC = complex; +}; + +using SM90_16x8x16_C64C64C64C64_TN = SM90::MMA_16x8x16_C64C64C64C64_TN; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ValTypeD = complex; + using ValTypeA = complex; + using ValTypeB = complex; + using ValTypeC = complex; +}; + +} // end namespace cute diff --git a/include/cute/atom/mma_traits_sm90_gmma.hpp b/include/cute/atom/mma_traits_sm90_gmma.hpp new file mode 100644 index 0000000000..8f59ff55b4 --- /dev/null +++ b/include/cute/atom/mma_traits_sm90_gmma.hpp @@ -0,0 +1,8998 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // cute::smem_ptr_flag +#include // cute::smem_sparse_ptr_flag +#include // cute::Swizzle +#include // cute::Tensor +#include // cute::LayoutType +#include // cute::SM90_64x8x16_F16F16F16_SS, etc +#include // cute::MMA_Traits +#include // cute::ComposedLayout +#include // cute::is_static + +namespace cute { + +// Fence between the async destination accumulators of GMMA & source for their dependent use +template +CUTE_HOST_DEVICE +void +warpgroup_fence_operand(Tensor& frg) { + CUTE_STATIC_ASSERT(is_static::value); + if constexpr (is_same_v) { + auto f32_frg = recast(frg); + CUTE_UNROLL + for (int i = 0; i < size(f32_frg); ++i) { + warpgroup_fence_operand(f32_frg(i)); + } + } + else { + CUTE_STATIC_ASSERT(is_rmem::value); + auto u32_frg = recast(frg); + CUTE_UNROLL + for (int i = 0; i < size(u32_frg); ++i) { + warpgroup_fence_operand(u32_frg(i)); + } + } +} + +namespace SM90::GMMA { + +/////////////////////////////////////////// +// Common layouts for GMMA Shared Memory // +/////////////////////////////////////////// + +// M|N-major GMMA layouts in units of bits +using Layout_MN_INTER_Atom_Bits = ComposedLayout, smem_ptr_flag, Layout,Stride<_1, _128>>>; +using Layout_MN_SW32_Atom_Bits = ComposedLayout, smem_ptr_flag, Layout,Stride<_1, _256>>>; +using Layout_MN_SW64_Atom_Bits = ComposedLayout, smem_ptr_flag, Layout,Stride<_1, _512>>>; +using Layout_MN_SW128_Atom_Bits = ComposedLayout, smem_ptr_flag, Layout,Stride<_1,_1024>>>; + +// K-major GMMA layouts in units of bits +using Layout_K_INTER_Atom_Bits = ComposedLayout, smem_ptr_flag, Layout,Stride< _128,_1>>>; +using Layout_K_SW32_Atom_Bits = ComposedLayout, smem_ptr_flag, Layout,Stride< _256,_1>>>; +using Layout_K_SW64_Atom_Bits = ComposedLayout, smem_ptr_flag, Layout,Stride< _512,_1>>>; +using Layout_K_SW128_Atom_Bits = ComposedLayout, smem_ptr_flag, Layout,Stride<_1024,_1>>>; + +// M|N-major layouts in units of Type +template +using Layout_MN_INTER_Atom = decltype(upcast::value>(Layout_MN_INTER_Atom_Bits{})); +template +using Layout_MN_SW32_Atom = 
decltype(upcast::value>(Layout_MN_SW32_Atom_Bits{})); +template +using Layout_MN_SW64_Atom = decltype(upcast::value>(Layout_MN_SW64_Atom_Bits{})); +template +using Layout_MN_SW128_Atom = decltype(upcast::value>(Layout_MN_SW128_Atom_Bits{})); + +// K-major layouts in units of Type +template +using Layout_K_INTER_Atom = decltype(upcast::value>(Layout_K_INTER_Atom_Bits{})); +template +using Layout_K_SW32_Atom = decltype(upcast::value>(Layout_K_SW32_Atom_Bits{})); +template +using Layout_K_SW64_Atom = decltype(upcast::value>(Layout_K_SW64_Atom_Bits{})); +template +using Layout_K_SW128_Atom = decltype(upcast::value>(Layout_K_SW128_Atom_Bits{})); + +// With GMMA::Major param +template +using Layout_INTER_Atom = typename conditional, + Layout_K_INTER_Atom>::type; +template +using Layout_SW32_Atom = typename conditional, + Layout_K_SW32_Atom>::type; +template +using Layout_SW64_Atom = typename conditional, + Layout_K_SW64_Atom>::type; +template +using Layout_SW128_Atom = typename conditional, + Layout_K_SW128_Atom>::type; + +// +// Tensor (position-dependent swizzle) to LayoutType utility +// + +template +CUTE_HOST_DEVICE constexpr +LayoutType +layout_type(Tensor> const&) +{ + static_assert(is_same::value, + "Expected uint128_t type in LayoutType conversion."); + + using Swizzle = get_swizzle_t; + constexpr int B = Swizzle::num_bits; + constexpr int M = Swizzle::num_base; + constexpr int S = Swizzle::num_shft; + + static_assert(M == 4, "Unsupported layout swizzle"); + static_assert(0 <= B && B <= 3, "Unsupported layout swizzle"); + static_assert(S == 3, "Unsupported layout swizzle"); + + switch (B) { + case 0: return LayoutType::INTERLEAVE; + case 1: return LayoutType::B32; + case 2: return LayoutType::B64; + case 3: return LayoutType::B128; + } + return LayoutType::INTERLEAVE; // ERROR +} + +/////////////////////////////////////////////////////////////////////////////// +// Construction method for GMMA Descriptors +/////////////////////////////////////////////////////////////////////////////// + +/** +* /////////////////////////////// +* // make_gmma_desc // +* /////////////////////////////// +* Each GmmaDescriptor Major-MN describes a canonical layout of the form +* +* LayoutType::INTERLEAVE : Swizzle<0,4,3> o smem_ptr o ((T,1,m),(8,k)):((1,T,SBO),(1T,LBO)) +* LayoutType::B32 : Swizzle<1,4,3> o smem_ptr o ((T,2,m),(8,k)):((1,T,LBO),(2T,SBO)) +* LayoutType::B64 : Swizzle<2,4,3> o smem_ptr o ((T,4,m),(8,k)):((1,T,LBO),(4T,SBO)) +* LayoutType::B128 : Swizzle<3,4,3> o smem_ptr o ((T,8,m),(8,k)):((1,T,LBO),(8T,SBO)) +* +* where +* T : sizeof(uint128_t) / sizeof(value_type) +* m : integer in [1,16] corresponding to GMMA shape +* k : integer in [1,32] corresponding to GMMA shape +* SBO: stride byte offset +* LBO: leading byte offset +* +* See GMMA::Layout_MN_XXX_Atom for building canonical GmmaDescriptor Major-MN layouts. +* For example, +* auto smem_layout = tile_to_shape(Layout_MN_SW128_Atom{}, Shape<_128,_64>{}); +* is guaranteed to be accepted by make_gmma_desc for appropriate value_type. 
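*
* As a concrete illustration (a hypothetical instantiation of the table above, not an
* additional guarantee): for value_type = half_t,
*     T = sizeof(uint128_t) / sizeof(half_t) = 8,
* so the LayoutType::B128 row reads
*     Swizzle<3,4,3> o smem_ptr o ((8,8,m),(8,k)):((1,8,LBO),(64,SBO))
* with m, k, SBO, and LBO determined by the actual shared memory layout.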
+* +* ////////////////////////////// +* // make_gmma_desc // +* ////////////////////////////// +* Each GmmaDescriptor Major-K describes a canonical layout of the form +* +* LayoutType::INTERLEAVE : Swizzle<0,4,3> o smem_ptr o ((8,m),(T,2)):((1T,SBO),(1,LBO)) +* LayoutType::B32 : Swizzle<1,4,3> o smem_ptr o ((8,m),(T,2)):((2T,SBO),(1, T )) +* LayoutType::B64 : Swizzle<2,4,3> o smem_ptr o ((8,m),(T,2)):((4T,SBO),(1, T )) +* LayoutType::B128 : Swizzle<3,4,3> o smem_ptr o ((8,m),(T,2)):((8T,SBO),(1, T )) +* +* See GMMA::Layout_K_XXX_Atom for building canonical GmmaDescriptor Major-K layouts. +* For example, +* auto smem_layout = tile_to_shape(Layout_K_SW128_Atom{}, Shape<_128,_64>{}); +* is guaranteed to be accepted by make_gmma_desc for appropriate value_type. +*/ +template +CUTE_HOST_DEVICE constexpr +GmmaDescriptor +make_gmma_desc(Tensor const& tensor) +{ + static_assert(is_smem::value, "GMMA Descriptors can only be constructed on smem."); + static_assert(TLayout::rank == 2, "GMMA Descriptors can only be constructed on rank-2 tensors."); + using value_type = typename TEngine::value_type; + + Tensor u128_tensor = recast(tensor); + + // Result + GmmaDescriptor desc; + + // Layout type + constexpr LayoutType LAYOUT_TYPE = layout_type(u128_tensor); + desc.bitfield.layout_type_ = uint8_t(LAYOUT_TYPE); + + // Start address (4LSB not included) + uint32_t start_address = cast_smem_ptr_to_uint(raw_pointer_cast(u128_tensor.data())); + desc.bitfield.start_address_ = static_cast(start_address >> 4); + + constexpr uint8_t base_offset = 0; + desc.bitfield.base_offset_ = base_offset; + + // LayoutType meta + constexpr int W = LAYOUT_TYPE == LayoutType::INTERLEAVE ? 1 : + LAYOUT_TYPE == LayoutType::B32 ? 2 : + LAYOUT_TYPE == LayoutType::B64 ? 4 : + LAYOUT_TYPE == LayoutType::B128 ? 8 : -1; + + if constexpr (MajorMode == Major::MN) + { + /* In units of uint128_t, each GmmaDescriptor Major-MN describes a canonical layout of the form + * + * LayoutType::INTERLEAVE : Swizzle<0,4,3> o smem_ptr o ((1,n),(8,k)):((X,SBO),(1,LBO)) + * LayoutType::B32 : Swizzle<1,4,3> o smem_ptr o ((2,n),(8,k)):((1,LBO),(2,SBO)) + * LayoutType::B64 : Swizzle<2,4,3> o smem_ptr o ((4,n),(8,k)):((1,LBO),(4,SBO)) + * LayoutType::B128 : Swizzle<3,4,3> o smem_ptr o ((8,n),(8,k)):((1,LBO),(8,SBO)) + */ + static_assert(size<1>(u128_tensor) == Int<(256 / cute::sizeof_bits::value)>{} || // A and B in dense MMA + size<1>(u128_tensor) == Int<(128 / cute::sizeof_bits::value)>{} || // A in sparse MMA + size<1>(u128_tensor) == Int<(512 / cute::sizeof_bits::value)>{}, // B in sparse MMA + "Not a canonical GMMA_MN Layout: Expected K-size 256/sizeof_bits for dense or (128|512)/sizeof_bits for sparse."); + + // Construct the canonical GMMA T Layout with shape ((W,n),(8,2)) + Layout canonical_layout = logical_divide(layout(u128_tensor), make_tile(Layout,_1>{}, Layout,_1>{})); + + // Check ranks of canonical + CUTE_STATIC_ASSERT_V(rank<0>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_MN Layout: No flat offset mode"); + CUTE_STATIC_ASSERT_V(rank<1>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_MN Layout: No flat offset mode"); + // Check canonical mode strides + constexpr uint32_t stride_00 = stride<0,0>(canonical_layout); + constexpr uint32_t expected_stride_00 = LAYOUT_TYPE == LayoutType::INTERLEAVE ? 
stride<0,0>(canonical_layout) : 1; + static_assert(stride_00 == expected_stride_00, "Not a canonical GMMA_MN Layout: Expected stride failure."); + constexpr uint32_t stride_10 = stride<1,0>(canonical_layout); + constexpr uint32_t expected_stride_10 = W; + static_assert(stride_10 == expected_stride_10, "Not a canonical GMMA_MN Layout: Expected stride failure."); + + // stride dimension byte offset and leading dimension byte offset (4LSB not included == uint128_t units) + constexpr uint32_t stride_01 = stride<0,1>(canonical_layout); + constexpr uint32_t stride_11 = stride<1,1>(canonical_layout); + + desc.bitfield.stride_byte_offset_ = (LAYOUT_TYPE == LayoutType::INTERLEAVE) ? stride_01 : stride_11; + desc.bitfield.leading_byte_offset_ = (LAYOUT_TYPE == LayoutType::INTERLEAVE) ? stride_11 : stride_01; + } + else if constexpr (MajorMode == Major::K) + { + /* In units of uint128_t, each GmmaDescriptor Major-K describes a canonical layout of the form + * + * LayoutType::INTERLEAVE : Swizzle<0,4,3> o smem_ptr o ((8,n),2):((1,SBO),LBO) + * LayoutType::B32 : Swizzle<1,4,3> o smem_ptr o ((8,n),2):((2,SBO),1) + * LayoutType::B64 : Swizzle<2,4,3> o smem_ptr o ((8,n),2):((4,SBO),1) + * LayoutType::B128 : Swizzle<3,4,3> o smem_ptr o ((8,n),2):((8,SBO),1) + */ + CUTE_STATIC_ASSERT_V(size<0>(u128_tensor) % Int<8>{} == Int<0>{}, // N|M size + "Not a canonical GMMA_K Layout: Expected MN-size multiple of 8."); + CUTE_STATIC_ASSERT_V(size<1>(u128_tensor) == Int<2>{} || size<1>(u128_tensor) == Int<4>{}, // K size + "Not a canonical GMMA_K Layout: Expected K-size 2 for dense or 4 for sparse (in units of uint128_t)."); + + // Construct the canonical GMMA N Layout with shape ((8,n),(2,1)) + Layout canonical_layout = logical_divide(layout(u128_tensor), make_tile(Layout<_8,_1>{}, Layout<_2,_1>{})); + + // Check ranks of canonical + CUTE_STATIC_ASSERT_V(rank<0>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_K Layout: No flat offset mode"); + CUTE_STATIC_ASSERT_V(rank<1>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_K Layout: No flat offset mode"); + // Check canonical mode strides + constexpr uint32_t stride_00 = stride<0,0>(canonical_layout); + constexpr uint32_t expected_stride_00 = W; + static_assert(stride_00 == expected_stride_00, "Not a canonical GMMA_K Layout: Expected stride failure."); + constexpr uint32_t stride_10 = stride<1,0>(canonical_layout); + constexpr uint32_t expected_stride_10 = (LAYOUT_TYPE == LayoutType::INTERLEAVE) ? 
stride<1,0>(canonical_layout) : 1; + static_assert(stride_10 == expected_stride_10, "Not a canonical GMMA_K Layout: Expected stride failure."); + + // stride dimension byte offset and leading dimension byte offset (4LSB not included == uint128_t units) + constexpr uint32_t stride_01 = stride<0,1>(canonical_layout); + + desc.bitfield.stride_byte_offset_ = stride_01; + desc.bitfield.leading_byte_offset_ = stride_10; + } else { + static_assert(MajorMode != Major::MN && MajorMode != Major::K, "Unrecognized MajorMode!"); + } + +#if 0 + // DEBUG and SANITY + assert((start_address & 0b0000001111) == 0); // Must be 16B aligned (4LSB are 0) no negotiation + assert((start_address & 0b1110000000) == 0); // Assert base_offset is 0, generalize later + if (thread0()) { + print("smem_desc input tensor: "); print(tensor.data()); print(" o "); print(tensor.layout()); print("\n"); + print("smem_desc uint128_t tensor: "); print(u128_tensor.data()); print(" o "); print(u128_tensor.layout()); print("\n"); + //print(" desc canonical layout: "); print(canonical_layout); print("\n"); + print(desc); + } +#endif + + return desc; +} + +/////////////////////////////////////////////////////////////////////////////// +// Higher level GMMA Descriptor utilities +/////////////////////////////////////////////////////////////////////////////// + +struct DescriptorIterator +{ + using reference = GmmaDescriptor; + using element_type = GmmaDescriptor; + using value_type = GmmaDescriptor; + + GmmaDescriptor desc_; + + // Dereference returns the GmmaDescriptor + CUTE_HOST_DEVICE constexpr + reference operator*() const { return desc_; } + + // Advance and return a new GmmaDescriptor + template + CUTE_HOST_DEVICE constexpr + reference operator[](Index const& i) const { return *(*this + i); } + + // Return an advanced iterator + template + CUTE_HOST_DEVICE constexpr + DescriptorIterator operator+(Index const& offset) const + { + return { GmmaDescriptor{desc_ + uint64_t(offset)} }; + } +}; + +template +CUTE_HOST_DEVICE constexpr +GmmaDescriptor +raw_pointer_cast(DescriptorIterator const& ptr) { + return ptr.desc_; +} + +// Recast a DescriptorIterator Tensor to uint64_t, it's RegType in mma_unpack +template +CUTE_HOST_DEVICE constexpr +DescriptorIterator +recast_ptr(DescriptorIterator const& iter) { + static_assert(is_same::value, "Can only cast GmmaDescriptorIterator to uint64_t."); + return iter; // Do nothing, it will still dereference to GmmaDescriptor and decay to uint64_t +} + +CUTE_HOST_DEVICE void +print(DescriptorIterator) { + printf("GMMA::DescriptorIterator"); +} + +// The GMMA Traits below have custom fragment type flags for their smem desc tensors. +// These flags specialize a MakeTensor customization point to correctly make the fragment that is desired. 
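// Illustrative sketch only: the helper below is hypothetical (not part of the CUTLASS
// API). The smem_desc fragments declared next ultimately reduce to a make_gmma_desc
// call on a canonical layout, e.g. a K-major SW128 atom tiled to a 128x64 half_t tile.
template <class T = half_t>
CUTE_DEVICE void
example_make_k_major_gmma_desc(T* smem)
{
  // Canonical K-major, 128B-swizzled layout atom tiled to an example 128x64 tile
  auto smem_layout = tile_to_shape(Layout_K_SW128_Atom<T>{}, Shape<_128,_64>{});
  // View the shared memory buffer through that layout
  Tensor sA = make_tensor(make_smem_ptr(smem), smem_layout);
  // Encode start address, layout type, LBO, and SBO as a Major-K GMMA descriptor
  GmmaDescriptor desc = make_gmma_desc<Major::K>(sA);
  (void) desc;
}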
+template +struct smem_desc : DescriptorIterator {}; + +} // end namespace SM90::GMMA + +// Customization point for creating a GMMA::smem_desc Tensor +template +struct MakeTensor> +{ + template + CUTE_HOST_DEVICE constexpr auto + operator()(Tensor const& smem_tensor) + { + static_assert(is_smem::value, "Expected SMEM Tensor to construct a GMMA Desc Tensor"); + return make_tensor(SM90::GMMA::DescriptorIterator{SM90::GMMA::make_gmma_desc(tensor<0>(smem_tensor))}, + replace<0>(recast(smem_tensor).layout(), Layout<_1,_0>{})); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////////// MMA_TRAITS /////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +namespace SM90::GMMA { + +// +// Specialized mma_unpack implementation for SM90 GMMA instructions +// + +template +CUTE_HOST_DEVICE constexpr +void +mma_unpack(MMA_Traits const& traits, + Tensor & D, + Tensor const& A, + Tensor const& B, + Tensor const& C) +{ + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + + // Register value types from the MMA_Operation register arrays + using RegTypeA = typename remove_extent::type; + using RegTypeB = typename remove_extent::type; + using RegTypeC = typename remove_extent::type; + + // SM90 GMMA take three arguments rather than four, try to assert C and D are aliased + static_assert(is_same::value, "GMMA C and D value_type must match."); + static_assert(is_same::value, "GMMA C and D layouts must match."); + // assert((void*)&C == (void*)&D); + + Tensor rA = recast(A); + Tensor rB = recast(B); + Tensor rC = recast(D); // NOTE: D and C are same, so use mutable D + + constexpr int RegNumA = extent::value; + constexpr int RegNumB = extent::value; + constexpr int RegNumC = extent::value; + + CUTE_STATIC_ASSERT_V(size(rA) == Int{}); + CUTE_STATIC_ASSERT_V(size(rB) == Int{}); + CUTE_STATIC_ASSERT_V(size(rC) == Int{}); + + detail::explode(MMA_Op::fma, + rA, make_int_sequence{}, + rB, make_int_sequence{}, + rC, make_int_sequence{}, + &(traits.accumulate_), seq<0>{}); +} + +// Accumulator layouts +template +using CLayout_64xN = Layout,Shape < _2,_2,Int>>, + Stride,Stride<_64,_8, _512>>>; + +using CLayout_64x8 = CLayout_64xN< 8>; +using CLayout_64x16 = CLayout_64xN< 16>; +using CLayout_64x32 = CLayout_64xN< 32>; +using CLayout_64x64 = CLayout_64xN< 64>; +using CLayout_64x96 = CLayout_64xN< 96>; +using CLayout_64x128 = CLayout_64xN<128>; +using CLayout_64x192 = CLayout_64xN<192>; +using CLayout_64x256 = CLayout_64xN<256>; + +// Register source layout for 32-bit value types +using ALayout_64x8 = Layout,Shape < _2, _2>>, + Stride,Stride< _8,_256>>>; + +// Register source layout for 16-bit (sparse 32-bit) value types +using ALayout_64x16 = Layout,Shape < _2,_2, _2>>, + Stride,Stride<_64,_8,_512>>>; + +// Register source layout for 8-bit (sparse 16-bit) value types +using ALayout_64x32 = Layout,Shape < _4,_2, _2>>, + Stride,Stride<_64,_8,_1024>>>; + +// Register source layout for sparse 8-bit value types +using ALayout_64x64 = Layout,Shape < _8,_2, _2>>, + Stride,Stride<_64,_8,_2048>>>; + +// Shared memory source layouts for any value type +template +using ABLayout = Layout,Int>>, + Stride< _0,Stride< _1,Int>>>; + +} // end namespace SM90::GMMA + +using 
namespace SM90; + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x16_F16F16F16_SS = SM90::GMMA::MMA_64x8x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x16_F16F16F16_RS = SM90::GMMA::MMA_64x8x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x16_F16F16F16_SS = SM90::GMMA::MMA_64x16x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x16_F16F16F16_RS = SM90::GMMA::MMA_64x16x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x16_F16F16F16_SS = SM90::GMMA::MMA_64x32x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x16_F16F16F16_RS = SM90::GMMA::MMA_64x32x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x16_F16F16F16_SS = SM90::GMMA::MMA_64x64x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x16_F16F16F16_RS = SM90::GMMA::MMA_64x64x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x16_F16F16F16_SS = SM90::GMMA::MMA_64x96x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x16_F16F16F16_RS = SM90::GMMA::MMA_64x96x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x16_F16F16F16_SS = SM90::GMMA::MMA_64x128x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x16_F16F16F16_RS = SM90::GMMA::MMA_64x128x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x16_F16F16F16_SS = SM90::GMMA::MMA_64x192x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x16_F16F16F16_RS = SM90::GMMA::MMA_64x192x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + 
using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x16_F16F16F16_SS = SM90::GMMA::MMA_64x256x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x16_F16F16F16_RS = SM90::GMMA::MMA_64x256x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x16_F32F16F16_SS = SM90::GMMA::MMA_64x8x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x16_F32F16F16_RS = SM90::GMMA::MMA_64x8x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using 
SM90_64x16x16_F32F16F16_SS = SM90::GMMA::MMA_64x16x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x16_F32F16F16_RS = SM90::GMMA::MMA_64x16x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x16_F32F16F16_SS = SM90::GMMA::MMA_64x32x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x16_F32F16F16_RS = SM90::GMMA::MMA_64x32x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x16_F32F16F16_SS = SM90::GMMA::MMA_64x64x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 64, 16>; + 
using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x16_F32F16F16_RS = SM90::GMMA::MMA_64x64x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x16_F32F16F16_SS = SM90::GMMA::MMA_64x96x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x16_F32F16F16_RS = SM90::GMMA::MMA_64x96x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x16_F32F16F16_SS = SM90::GMMA::MMA_64x128x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x16_F32F16F16_RS = SM90::GMMA::MMA_64x128x16_F32F16F16_RS; + +template 
+struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x16_F32F16F16_SS = SM90::GMMA::MMA_64x192x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x16_F32F16F16_RS = SM90::GMMA::MMA_64x192x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x16_F32F16F16_SS = SM90::GMMA::MMA_64x256x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x16_F32F16F16_RS = SM90::GMMA::MMA_64x256x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
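// Illustrative usage sketch (kept as a comment; make_tiled_mma and print_latex are
// declared in <cute/atom/mma_atom.hpp>, not in this header): any of the SS/RS ops
// above can be handed to make_tiled_mma to build a warpgroup-wide TiledMMA. The
// 64x64x16 F16 SS op with K-major A and B is an arbitrary example choice.
//
//   auto tiled_mma = make_tiled_mma(
//       SM90_64x64x16_F16F16F16_SS<GMMA::Major::K, GMMA::Major::K>{});
//   print_latex(tiled_mma);   // inspect the thread/value partitioning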
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x8x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x8x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x16x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x16x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x32x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = 
bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x32x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x64x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x64x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x96x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; 
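// Worked size note (illustrative): the CLayout_64xN accumulator layouts used above
// distribute the 64xN C/D tile over the 128 threads of a warpgroup, i.e.
//   (64 * N) / 128 = N/2 accumulator values per thread,
// so the 64x96 BF16->F32 traits above leave each thread holding 48 F32 accumulators:
//   size(GMMA::CLayout_64x96{}) == 64 * 96,  and  64 * 96 / 128 == 48.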
+ +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x96x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x128x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x128x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x192x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x192x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = 
float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x256x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x256x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x8x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 8, 8>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x8x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 8, 8>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
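// Note (illustrative): unlike the F16/BF16 ops above, the TF32 ops are TN-only
// (both A and B are consumed K-major), so their aliases expose no GMMA::Major
// parameters, only the GMMA::ScaleIn inputs. With the defaults shown above,
//
//   SM90_64x8x8_F32TF32TF32_SS_TN<>
//
// is shorthand for
//
//   SM90_64x8x8_F32TF32TF32_SS_TN<GMMA::ScaleIn::One, GMMA::ScaleIn::One>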
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x16x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_8>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 8>;
+  using BLayout = GMMA::ABLayout< 16, 8>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x16x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_8>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout< 16, 8>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x32x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_8>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 8>;
+  using BLayout = GMMA::ABLayout< 32, 8>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x32x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_8>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout< 32, 8>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x64x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_8>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 8>;
+  using BLayout = GMMA::ABLayout< 64, 8>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x64x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_8>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout< 64, 8>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x96x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_8>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 8>;
+  using BLayout = GMMA::ABLayout< 96, 8>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x96x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_8>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout< 96, 8>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x128x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_8>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 8>;
+  using BLayout = GMMA::ABLayout<128, 8>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x128x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+ using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<128, 8>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x192x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<192, 8>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x192x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<192, 8>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x256x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<256, 8>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x256x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<256, 8>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32S8S8_SS_TN = 
SM90::GMMA::MMA_64x8x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x16x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x32x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using 
FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x64x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x96x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x128x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = 
GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x192x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x256x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x8x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x16x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x32x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = 
Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x64x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x96x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x128x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32S8S8_RS_TN_SATURATE = 
SM90::GMMA::MMA_64x128x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x192x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x256x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x8x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using 
BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x16x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x32x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x64x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x96x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x128x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32S8U8_SS_TN_SATURATE; + 
+template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x192x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x256x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x8x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + 
using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x16x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x32x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32S8U8_RS_TN = 
SM90::GMMA::MMA_64x64x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x96x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x128x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + 
using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x192x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x256x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x8x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = 
int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x16x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x32x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x64x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + 
using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x96x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x128x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut 
accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x192x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x256x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x8x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32U8S8_RS_TN_SATURATE = 
SM90::GMMA::MMA_64x8x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x16x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x32x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x64x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = 
GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x96x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x128x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x192x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = 
int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x256x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x8x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x16x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x32x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x64x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32U8U8_SS_TN_SATURATE = 
SM90::GMMA::MMA_64x64x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x96x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x128x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x192x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = 
int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x256x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x256x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x8x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x8x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = 
GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x16x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x16x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x32x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x32x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x64x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x64x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + 
using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x96x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x96x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x128x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x128x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x192x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x192x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
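For context, the specializations in this hunk only declare trait metadata; CuTe consults an MMA_Traits specialization like these when the corresponding GMMA op is used as an MMA atom. The short sketch below is illustrative only and is not part of the diff: it assumes a standard CUTLASS/CuTe include setup, and the alias name TiledMmaU8S8 and the single-atom tiling are arbitrary choices made for the example.

#include <cute/tensor.hpp>
#include <cute/atom/mma_atom.hpp>

using namespace cute;

// Build a TiledMMA from one of the ops whose traits appear in this hunk:
// one warpgroup (128 threads, matching ThrID = Layout<_128>) drives a single
// 64x64x32 S32 += U8 * S8 atom per MMA tile.
using TiledMmaU8S8 = decltype(make_tiled_mma(
    SM90_64x64x32_S32U8S8_SS_TN{},
    Layout<Shape<_1,_1,_1>>{}));

// Inside a kernel, each thread would then take its slice as usual:
//   TiledMmaU8S8 tiled_mma;
//   auto thr_mma = tiled_mma.get_thread_slice(threadIdx.x);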
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8U8_RS_TN_SATURATE;
+
+template <>
+struct MMA_Traits<SM90_64x192x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x256x32_S32U8U8_RS_TN;
+
+template <>
+struct MMA_Traits<SM90_64x256x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8U8_RS_TN_SATURATE;
+
+template <>
+struct MMA_Traits<SM90_64x256x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout 
= GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using 
ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn 
scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout 
= GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; 
+ using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = 
GMMA::ScaleIn::One +> +using SM90_64x256x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = 
GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + 
+ using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using 
SM90_64x64x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut 
accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using 
SM90_64x192x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; 
+ + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F16E5M2E4M3_SS_TN = 
SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F32E5M2E4M3_SS_TN = 
SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x8x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F16E5M2E5M2_SS_TN = 
SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x16x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x32x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x64x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F32E5M2E5M2_SS_TN = 
SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x96x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x128x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x192x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x256x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute + +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) +#include "mma_traits_sm90_gmma_ext.hpp" +#endif 
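For context, here is a minimal sketch of how one of the FP8 GMMA trait specializations declared in this header is typically consumed through CuTe's `make_tiled_mma`. This is an illustrative example and not part of the patch; it assumes an `nvcc` build targeting `sm_90a` with the CUTLASS/CuTe headers on the include path, and relies on the defaulted `GMMA::ScaleIn::One` scale arguments.

```cpp
// Illustrative sketch only (not part of the patch). Assumes nvcc targeting sm_90a
// and the CUTLASS/CuTe headers on the include path.
#include <cute/tensor.hpp>
#include <cute/atom/mma_atom.hpp>

using namespace cute;

int main() {
  // 64x128x32 E5M2 x E5M2 -> F32 atom, both operands read from shared memory (SS),
  // TN layout; scaleA/scaleB take their GMMA::ScaleIn::One defaults.
  auto tiled_mma = make_tiled_mma(SM90_64x128x32_F32E5M2E5M2_SS_TN<>{});

  // One GMMA atom is executed by a full warpgroup (ThrID = Layout<_128>).
  print(size(tiled_mma));   // number of participating threads
  print_latex(tiled_mma);   // dump the thread/value layout for inspection
  return 0;
}
```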
diff --git a/include/cute/atom/mma_traits_sm90_gmma_ext.hpp b/include/cute/atom/mma_traits_sm90_gmma_ext.hpp new file mode 100644 index 0000000000..15e2412c87 --- /dev/null +++ b/include/cute/atom/mma_traits_sm90_gmma_ext.hpp @@ -0,0 +1,20116 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include +#include + +namespace cute { + +namespace SM90::GMMA { + +using CLayout_64x24 = CLayout_64xN< 24>; +using CLayout_64x40 = CLayout_64xN< 40>; +using CLayout_64x48 = CLayout_64xN< 48>; +using CLayout_64x56 = CLayout_64xN< 56>; +using CLayout_64x72 = CLayout_64xN< 72>; +using CLayout_64x80 = CLayout_64xN< 80>; +using CLayout_64x88 = CLayout_64xN< 88>; +using CLayout_64x104 = CLayout_64xN<104>; +using CLayout_64x112 = CLayout_64xN<112>; +using CLayout_64x120 = CLayout_64xN<120>; +using CLayout_64x136 = CLayout_64xN<136>; +using CLayout_64x144 = CLayout_64xN<144>; +using CLayout_64x152 = CLayout_64xN<152>; +using CLayout_64x160 = CLayout_64xN<160>; +using CLayout_64x168 = CLayout_64xN<168>; +using CLayout_64x176 = CLayout_64xN<176>; +using CLayout_64x184 = CLayout_64xN<184>; +using CLayout_64x200 = CLayout_64xN<200>; +using CLayout_64x208 = CLayout_64xN<208>; +using CLayout_64x216 = CLayout_64xN<216>; +using CLayout_64x224 = CLayout_64xN<224>; +using CLayout_64x232 = CLayout_64xN<232>; +using CLayout_64x240 = CLayout_64xN<240>; +using CLayout_64x248 = CLayout_64xN<248>; + +} + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x16_F16F16F16_SS = SM90::GMMA::MMA_64x24x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 24, 16>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x16_F16F16F16_RS = SM90::GMMA::MMA_64x24x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 24, 16>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x16_F16F16F16_SS = SM90::GMMA::MMA_64x40x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 40, 16>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x16_F16F16F16_RS = SM90::GMMA::MMA_64x40x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 40, 16>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x16_F16F16F16_SS = SM90::GMMA::MMA_64x48x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 48, 16>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x16_F16F16F16_RS = SM90::GMMA::MMA_64x48x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 48, 16>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x16_F16F16F16_SS = SM90::GMMA::MMA_64x56x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 56, 16>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x16_F16F16F16_RS = SM90::GMMA::MMA_64x56x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = 
half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 56, 16>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x16_F16F16F16_SS = SM90::GMMA::MMA_64x72x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 72, 16>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x16_F16F16F16_RS = SM90::GMMA::MMA_64x72x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 72, 16>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x16_F16F16F16_SS = SM90::GMMA::MMA_64x80x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 80, 16>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x16_F16F16F16_RS = SM90::GMMA::MMA_64x80x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 80, 16>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
+template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x16_F16F16F16_SS = SM90::GMMA::MMA_64x88x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 88, 16>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x16_F16F16F16_RS = SM90::GMMA::MMA_64x88x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 88, 16>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x16_F16F16F16_SS = SM90::GMMA::MMA_64x104x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<104, 16>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x16_F16F16F16_RS = SM90::GMMA::MMA_64x104x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<104, 16>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x16_F16F16F16_SS = SM90::GMMA::MMA_64x112x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<112, 16>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x16_F16F16F16_RS = SM90::GMMA::MMA_64x112x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<112, 16>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x16_F16F16F16_SS = SM90::GMMA::MMA_64x120x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<120, 16>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x16_F16F16F16_RS = SM90::GMMA::MMA_64x120x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<120, 16>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x16_F16F16F16_SS = SM90::GMMA::MMA_64x136x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<136, 16>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + 
GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x16_F16F16F16_RS = SM90::GMMA::MMA_64x136x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<136, 16>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x16_F16F16F16_SS = SM90::GMMA::MMA_64x144x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<144, 16>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x16_F16F16F16_RS = SM90::GMMA::MMA_64x144x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<144, 16>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x16_F16F16F16_SS = SM90::GMMA::MMA_64x152x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<152, 16>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x16_F16F16F16_RS = SM90::GMMA::MMA_64x152x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_16>; + using 
ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<152, 16>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x16_F16F16F16_SS = SM90::GMMA::MMA_64x160x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<160, 16>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x16_F16F16F16_RS = SM90::GMMA::MMA_64x160x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<160, 16>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x16_F16F16F16_SS = SM90::GMMA::MMA_64x168x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<168, 16>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x16_F16F16F16_RS = SM90::GMMA::MMA_64x168x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<168, 16>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn 
scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x16_F16F16F16_SS = SM90::GMMA::MMA_64x176x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<176, 16>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x16_F16F16F16_RS = SM90::GMMA::MMA_64x176x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<176, 16>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x16_F16F16F16_SS = SM90::GMMA::MMA_64x184x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<184, 16>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x16_F16F16F16_RS = SM90::GMMA::MMA_64x184x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<184, 16>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x16_F16F16F16_SS = SM90::GMMA::MMA_64x200x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_16>; + using ThrID = Layout<_128>; + using ALayout = 
GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<200, 16>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x16_F16F16F16_RS = SM90::GMMA::MMA_64x200x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<200, 16>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x16_F16F16F16_SS = SM90::GMMA::MMA_64x208x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<208, 16>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x16_F16F16F16_RS = SM90::GMMA::MMA_64x208x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<208, 16>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x16_F16F16F16_SS = SM90::GMMA::MMA_64x216x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<216, 16>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> 
+using SM90_64x216x16_F16F16F16_RS = SM90::GMMA::MMA_64x216x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<216, 16>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x16_F16F16F16_SS = SM90::GMMA::MMA_64x224x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<224, 16>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x16_F16F16F16_RS = SM90::GMMA::MMA_64x224x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<224, 16>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x16_F16F16F16_SS = SM90::GMMA::MMA_64x232x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<232, 16>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x16_F16F16F16_RS = SM90::GMMA::MMA_64x232x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<232, 16>; + using 
CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x16_F16F16F16_SS = SM90::GMMA::MMA_64x240x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<240, 16>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x16_F16F16F16_RS = SM90::GMMA::MMA_64x240x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<240, 16>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x16_F16F16F16_SS = SM90::GMMA::MMA_64x248x16_F16F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<248, 16>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x16_F16F16F16_RS = SM90::GMMA::MMA_64x248x16_F16F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<248, 16>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x16_F32F16F16_SS = SM90::GMMA::MMA_64x24x16_F32F16F16_SS; + 
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_24,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 24, 16>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x16_F32F16F16_RS = SM90::GMMA::MMA_64x24x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_24,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 24, 16>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x16_F32F16F16_SS = SM90::GMMA::MMA_64x40x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_40,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 40, 16>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x16_F32F16F16_RS = SM90::GMMA::MMA_64x40x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_40,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 40, 16>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x16_F32F16F16_SS = SM90::GMMA::MMA_64x48x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_48,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 48, 16>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x16_F32F16F16_RS = SM90::GMMA::MMA_64x48x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_48,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 48, 16>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x16_F32F16F16_SS = SM90::GMMA::MMA_64x56x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_56,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 56, 16>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x16_F32F16F16_RS = SM90::GMMA::MMA_64x56x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_56,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 56, 16>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x16_F32F16F16_SS = SM90::GMMA::MMA_64x72x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_72,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 72, 16>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x16_F32F16F16_RS = SM90::GMMA::MMA_64x72x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_72,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 72, 16>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x16_F32F16F16_SS = SM90::GMMA::MMA_64x80x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_80,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 80, 16>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x16_F32F16F16_RS = SM90::GMMA::MMA_64x80x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_80,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 80, 16>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x16_F32F16F16_SS = SM90::GMMA::MMA_64x88x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_88,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 88, 16>;
+  using CLayout = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x16_F32F16F16_RS = SM90::GMMA::MMA_64x88x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_88,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 88, 16>;
+  using CLayout = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x16_F32F16F16_SS = SM90::GMMA::MMA_64x104x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<104, 16>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x16_F32F16F16_RS = SM90::GMMA::MMA_64x104x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<104, 16>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x16_F32F16F16_SS = SM90::GMMA::MMA_64x112x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<112, 16>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x16_F32F16F16_RS = SM90::GMMA::MMA_64x112x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<112, 16>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x16_F32F16F16_SS = SM90::GMMA::MMA_64x120x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<120, 16>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x16_F32F16F16_RS = SM90::GMMA::MMA_64x120x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<120, 16>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x16_F32F16F16_SS = SM90::GMMA::MMA_64x136x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<136, 16>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x16_F32F16F16_RS = SM90::GMMA::MMA_64x136x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<136, 16>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x16_F32F16F16_SS = SM90::GMMA::MMA_64x144x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<144, 16>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major 
tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x16_F32F16F16_RS = SM90::GMMA::MMA_64x144x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<144, 16>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x16_F32F16F16_SS = SM90::GMMA::MMA_64x152x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<152, 16>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x16_F32F16F16_RS = SM90::GMMA::MMA_64x152x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<152, 16>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x16_F32F16F16_SS = SM90::GMMA::MMA_64x160x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<160, 16>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x16_F32F16F16_RS = SM90::GMMA::MMA_64x160x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_16>; + using ThrID = Layout<_128>; + 
using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<160, 16>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x16_F32F16F16_SS = SM90::GMMA::MMA_64x168x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<168, 16>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x16_F32F16F16_RS = SM90::GMMA::MMA_64x168x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<168, 16>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x16_F32F16F16_SS = SM90::GMMA::MMA_64x176x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<176, 16>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x16_F32F16F16_RS = SM90::GMMA::MMA_64x176x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<176, 16>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> 
+using SM90_64x184x16_F32F16F16_SS = SM90::GMMA::MMA_64x184x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<184, 16>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x16_F32F16F16_RS = SM90::GMMA::MMA_64x184x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<184, 16>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x16_F32F16F16_SS = SM90::GMMA::MMA_64x200x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<200, 16>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x16_F32F16F16_RS = SM90::GMMA::MMA_64x200x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<200, 16>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x16_F32F16F16_SS = SM90::GMMA::MMA_64x208x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = 
GMMA::ABLayout<208, 16>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x16_F32F16F16_RS = SM90::GMMA::MMA_64x208x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<208, 16>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x16_F32F16F16_SS = SM90::GMMA::MMA_64x216x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<216, 16>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x16_F32F16F16_RS = SM90::GMMA::MMA_64x216x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<216, 16>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x16_F32F16F16_SS = SM90::GMMA::MMA_64x224x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<224, 16>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x16_F32F16F16_RS = 
SM90::GMMA::MMA_64x224x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<224, 16>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x16_F32F16F16_SS = SM90::GMMA::MMA_64x232x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<232, 16>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x16_F32F16F16_RS = SM90::GMMA::MMA_64x232x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<232, 16>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x16_F32F16F16_SS = SM90::GMMA::MMA_64x240x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<240, 16>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x16_F32F16F16_RS = SM90::GMMA::MMA_64x240x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<240, 16>; + using CLayout = GMMA::CLayout_64x240; + + 
GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x16_F32F16F16_SS = SM90::GMMA::MMA_64x248x16_F32F16F16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<248, 16>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x16_F32F16F16_RS = SM90::GMMA::MMA_64x248x16_F32F16F16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<248, 16>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x24x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 24, 16>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x24x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 24, 16>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x40x16_F32BF16BF16_SS; + +template +struct 
MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 40, 16>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x40x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 40, 16>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x48x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 48, 16>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x48x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 48, 16>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x56x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 56, 16>; + using CLayout = 
GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x56x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 56, 16>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x72x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 72, 16>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x72x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 72, 16>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x80x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 80, 16>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x16_F32BF16BF16_RS = 
SM90::GMMA::MMA_64x80x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 80, 16>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x88x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 88, 16>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x88x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 88, 16>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x104x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<104, 16>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x104x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<104, 16>; + using 
CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x112x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<112, 16>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x112x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<112, 16>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x120x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<120, 16>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x120x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<120, 16>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x16_F32BF16BF16_SS = 
SM90::GMMA::MMA_64x136x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<136, 16>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x136x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<136, 16>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x144x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<144, 16>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x144x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<144, 16>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x152x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + 
using BLayout = GMMA::ABLayout<152, 16>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x152x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<152, 16>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x160x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<160, 16>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x160x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<160, 16>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x168x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<168, 16>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = 
GMMA::ScaleIn::One +> +using SM90_64x168x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x168x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<168, 16>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x176x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<176, 16>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x176x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<176, 16>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x184x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<184, 16>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x184x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_16>; + using ThrID = Layout<_128>; + using ALayout = 
GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<184, 16>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x200x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<200, 16>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x200x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<200, 16>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x208x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<208, 16>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x208x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<208, 16>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn 
scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x216x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<216, 16>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x216x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<216, 16>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x224x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<224, 16>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x224x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<224, 16>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x232x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_16>; + 
using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<232, 16>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x232x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<232, 16>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x240x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<240, 16>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x240x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<240, 16>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x248x16_F32BF16BF16_SS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<248, 16>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::Major tnspA, + GMMA::Major tnspB, + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x248x16_F32BF16BF16_RS; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<248, 16>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x24x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 24, 8>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x24x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 24, 8>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x40x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 40, 8>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x40x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 40, 8>; + using CLayout = GMMA::CLayout_64x40; + + 
GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x48x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 48, 8>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x48x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 48, 8>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x56x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 56, 8>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x56x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 56, 8>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x72x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 72, 8>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x72x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 72, 8>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x80x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 80, 8>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x80x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 80, 8>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x88x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 88, 8>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x88x8_F32TF32TF32_RS_TN; 
+ +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 88, 8>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x104x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<104, 8>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x104x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<104, 8>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x112x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<112, 8>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x112x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<112, 8>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
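Every block in this hunk follows one of two fixed patterns: `_SS` atoms read both A and B through shared-memory matrix descriptors (`GMMA::smem_desc<...>` fragments), while `_RS` atoms feed A from the warpgroup's registers and keep only the B descriptor, swapping `ALayout` for the register layout (`GMMA::ALayout_64x8` here). For reference, this is the 64x112x8 TF32 `SS_TN` entry written out with the template arguments these aliases take in CuTe; the annotations are editorial, not part of the header.

```cpp
template <
  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
>
using SM90_64x112x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x112x8_F32TF32TF32_SS_TN<scaleA, scaleB>;

template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
struct MMA_Traits<SM90_64x112x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
{
  using ValTypeD = float;                             // D (accumulator output) type
  using ValTypeA = tfloat32_t;                        // A operand type
  using ValTypeB = tfloat32_t;                        // B operand type
  using ValTypeC = float;                             // C (accumulator input) type

  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;   // A is consumed via an smem matrix descriptor
  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;   // B likewise (TN => both operands K-major)

  using Shape_MNK = Shape<_64,_112,_8>;               // one atom covers a 64x112x8 tile
  using ThrID     = Layout<_128>;                     // issued by one warpgroup (128 threads)
  using ALayout   = GMMA::ABLayout< 64, 8>;           // (thread,value) -> coordinate maps for the
  using BLayout   = GMMA::ABLayout<112, 8>;           //   64x8 A tile and the 112x8 B tile
  using CLayout   = GMMA::CLayout_64x112;             // accumulator distribution over the warpgroup

  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;   // runtime flag: One => D += A*B, Zero => D = A*B
};
```

The matching `_RS_TN` entry differs only in that `FrgTypeA` is dropped (A is supplied from registers) and `ALayout` becomes `GMMA::ALayout_64x8`.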
+template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x120x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<120, 8>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x120x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<120, 8>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x136x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<136, 8>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x136x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<136, 8>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x144x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = 
GMMA::ABLayout<144, 8>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x144x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<144, 8>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x152x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<152, 8>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x152x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<152, 8>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x160x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<160, 8>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x160x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + 
using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<160, 8>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x168x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<168, 8>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x168x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<168, 8>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x176x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<176, 8>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x176x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<176, 8>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using 
SM90_64x184x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x184x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<184, 8>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x184x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<184, 8>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x200x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<200, 8>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x200x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<200, 8>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x208x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<208, 8>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x208x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<208, 8>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x216x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<216, 8>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x216x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<216, 8>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x224x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<224, 8>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x224x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = 
Shape<_64,_224,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<224, 8>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x232x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<232, 8>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x232x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<232, 8>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x240x8_F32TF32TF32_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<240, 8>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x240x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<240, 8>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x248x8_F32TF32TF32_SS_TN; + +template +struct 
MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<248, 8>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x248x8_F32TF32TF32_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = tfloat32_t; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<248, 8>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x24x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x48x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using 
FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x80x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x112x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x144x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = 
GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x160x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x176x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
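The int8 entries above (the `_SATURATE` aliases map to the saturating, `satfinite` form of the instruction) are not used directly; they are wrapped into a `TiledMMA` and handed to `cute::gemm`. A minimal host-side sketch of that composition, assuming an SM90-capable CUTLASS checkout on the include path and compilation with nvcc; note that in recent releases these extended-N shapes may additionally need the extended GMMA shape headers enabled (e.g. via `CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED`).

```cpp
#include <cute/tensor.hpp>
#include <cute/atom/mma_atom.hpp>

int main() {
  using namespace cute;

  // s32 += s8 * s8, both operands from shared memory, defined just above.
  using Atom       = SM90_64x176x32_S32S8S8_SS_TN;
  using AtomTraits = MMA_Traits<Atom>;

  // Tile two atoms along M: one CTA tile of 128x176x32, i.e. two warpgroups.
  auto tiled_mma = make_tiled_mma(Atom{}, Layout<Shape<_2,_1,_1>>{});

  CUTE_STATIC_ASSERT_V(size(tiled_mma) == Int<256>{});   // 2 x 128 threads

  print("Atom Shape_MNK : "); print(AtomTraits::Shape_MNK{}); print("\n");
  print("TiledMMA       : "); print(tiled_mma);              print("\n");
  return 0;
}
```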
+//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x208x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x224x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x240x32_S32S8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32S8S8_SS_TN_SATURATE = 
SM90::GMMA::MMA_64x240x32_S32S8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x24x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x48x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x80x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = 
GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x112x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x144x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x160x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + 
using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x176x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x208x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x224x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x240x32_S32S8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32S8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x24x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + 
using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x48x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x80x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x112x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = 
GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x144x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x160x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x176x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x208x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x224x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32S8U8_SS_TN_SATURATE = 
SM90::GMMA::MMA_64x224x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x240x32_S32S8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32S8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x24x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x48x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; 
+ using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x80x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x112x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32S8U8_RS_TN = 
SM90::GMMA::MMA_64x144x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x160x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x176x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = 
GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x208x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x224x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x240x32_S32S8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32S8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = 
int32_t; + using ValTypeA = int8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x24x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x48x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x80x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using 
ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x112x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x144x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; 
+ +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x160x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x176x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x208x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32U8S8_SS_TN_SATURATE = 
SM90::GMMA::MMA_64x208x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x224x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x240x32_S32U8S8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8S8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x24x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = 
int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x48x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x80x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x112x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x144x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x160x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; 
+ + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x176x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x208x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x224x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using 
SM90_64x224x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x240x32_S32U8S8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8S8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x24x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x48x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + 
using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x80x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x112x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = 
GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x144x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x160x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x176x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// 
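The trait specializations in this hunk are consumed entirely at compile time. Below is a minimal, illustrative sketch (not part of the patch) of how one of the ops declared above can be inspected through `cute::MMA_Traits`; the include path and the placement of these names in the `cute` namespace are assumptions about a typical CUTLASS checkout.

```cpp
// Illustrative sketch only, not part of this patch.
// Assumes a CUTLASS checkout with CuTe on the include path; header path is assumed.
#include <cstdint>
#include <type_traits>
#include <cute/atom/mma_traits_sm90_gmma.hpp>

// One of the ops declared above: D(s32) += A(u8) * B(u8), tile 64x176x32, A and B in smem (SS).
using Op     = cute::SM90_64x176x32_S32U8U8_SS_TN;
using Traits = cute::MMA_Traits<Op>;

static_assert(std::is_same_v<Traits::ValTypeA, std::uint8_t>, "A operand is u8");
static_assert(std::is_same_v<Traits::ValTypeB, std::uint8_t>, "B operand is u8");
static_assert(std::is_same_v<Traits::ValTypeD, std::int32_t>, "accumulator is s32");
```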
+ + +using SM90_64x176x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x208x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x224x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x224x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x240x32_S32U8U8_SS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA 
= uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x240x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8U8_SS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x24x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x24x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x48x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x48x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + 
+ GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x80x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x80x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x112x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x112x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x144x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x144x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using 
ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x160x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x160x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x176x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x176x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8U8_RS_TN_SATURATE; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +using SM90_64x208x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x208x32_S32U8U8_RS_TN; + +template <> +struct MMA_Traits +{ + using ValTypeD = int32_t; + using ValTypeA = uint8_t; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32U8U8_RS_TN_SATURATE;
+
+template <>
+struct MMA_Traits<SM90_64x208x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x224x32_S32U8U8_RS_TN;
+
+template <>
+struct MMA_Traits<SM90_64x224x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8U8_RS_TN_SATURATE;
+
+template <>
+struct MMA_Traits<SM90_64x224x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x240x32_S32U8U8_RS_TN;
+
+template <>
+struct MMA_Traits<SM90_64x240x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8U8_RS_TN_SATURATE;
+
+template <>
+struct MMA_Traits<SM90_64x240x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_40,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 40, 32>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
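The FP8 specializations above differ from the integer ones earlier in the hunk in that both the op alias and its `MMA_Traits` specialization are templated on two `GMMA::ScaleIn` parameters, each defaulting to `ScaleIn::One`. A small illustrative sketch (not part of the patch; include path and namespace placement assumed) of instantiating one such op with explicit scale arguments:

```cpp
// Illustrative sketch only, not part of this patch; include path is assumed.
#include <type_traits>
#include <cute/atom/mma_traits_sm90_gmma.hpp>

// Spell out both ScaleIn parameters explicitly (they default to ScaleIn::One).
using Fp8Op     = cute::SM90_64x24x32_F32E4M3E4M3_SS_TN<cute::GMMA::ScaleIn::One,
                                                        cute::GMMA::ScaleIn::One>;
using Fp8Traits = cute::MMA_Traits<Fp8Op>;

static_assert(std::is_same_v<Fp8Traits::ValTypeA, Fp8Traits::ValTypeB>, "A and B are both e4m3");
static_assert(std::is_same_v<Fp8Traits::ValTypeD, float>, "F32 accumulator");
```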
+template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = 
GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using 
ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = 
GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = 
half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = 
GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + 
using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + 
using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = 
float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn 
scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<184, 
32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using 
ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = 
GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = 
float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
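//
// A minimal usage sketch (an assumption based on CuTe's MMA_Atom/TiledMMA conventions,
// not something this file spells out): trait specializations like the ones above are
// consumed by wrapping the corresponding op in a TiledMMA. The atom picked below
// (SM90_64x232x32_F32E4M3E4M3_SS_TN, defined earlier in this file) and the function and
// tensor names (fp8_gmma_usage_sketch, tCrA, tCrB, tCrC) are illustrative only.
//
#include <cute/tensor.hpp>
#include <cute/atom/mma_atom.hpp>

__device__ void fp8_gmma_usage_sketch()
{
  using namespace cute;

  // Both scale inputs default to GMMA::ScaleIn::One via the alias's default arguments.
  using MmaOp = SM90_64x232x32_F32E4M3E4M3_SS_TN<>;

  // One warpgroup-wide atom, tiled with the default 1x1x1 atom layout.
  auto tiled_mma = make_tiled_mma(MmaOp{});

  // Each of the 128 threads (ThrID = Layout<_128>) takes its slice of the atom; a
  // mainloop would then combine A/B shared-memory descriptors with the f32 accumulator
  // via cute::gemm(tiled_mma, tCrA, tCrB, tCrC).
  auto thr_mma = tiled_mma.get_thread_slice(threadIdx.x);
  (void) thr_mma;
}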
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F32E4M3E5M2_SS_TN = 
SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; 
+}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = 
Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F16E4M3E5M2_SS_TN = 
SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F32E4M3E5M2_SS_TN = 
SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using 
SM90_64x144x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = 
GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + 
using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = 
GMMA::ScaleIn::One +> +using SM90_64x168x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using 
CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = 
float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = 
GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = 
float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA 
= GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using 
BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using 
ValTypeA = float_e4m3_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = 
GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using 
ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn 
scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = 
GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = 
float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> 
+using SM90_64x112x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = 
GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + 
using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = 
GMMA::ScaleIn::One +> +using SM90_64x144x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<152, 32>; + using 
CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = 
float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + 
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = 
GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = 
float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA 
= GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x208x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using 
BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using 
ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = 
GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using 
ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x24x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x40x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + 
using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x48x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x56x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = 
float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x72x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x80x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = 
GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x88x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + 
using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x104x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = 
GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x112x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using 
BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x120x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x136x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using 
ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x144x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + 
GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x152x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = 
GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x160x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using 
ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x168x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// 
+ +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x176x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using 
ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x184x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x200x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> 
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_200,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<200, 32>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
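+// NOTE: The E5M2 x E5M2 TN traits above and below all follow one pattern: a 64xNx32 GMMA with
+// K-major A and B operands and either half_t or float accumulators. A minimal usage sketch,
+// assuming an SM90 build; the partitioned tensor names are illustrative placeholders, not
+// defined in this header:
+//
+//   TiledMMA tiled_mma = make_tiled_mma(SM90_64x208x32_F32E5M2E5M2_SS_TN<>{});  // scales default to One
+//   // Partition A/B/C with tiled_mma, then call cute::gemm(tiled_mma, tCrA, tCrB, tCrC);
+//   // accumulate_ (GMMA::ScaleOut::One vs. Zero) selects accumulation into C vs. overwrite.
+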
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x216x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x224x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using 
SM90_64x232x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x232x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x240x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_SS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = 
GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +using SM90_64x248x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_RS_TN; + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = float_e5m2_t; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute diff --git a/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp b/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp new file mode 100644 index 0000000000..161dc7ecf0 --- /dev/null +++ b/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp @@ -0,0 +1,7738 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include // cute::smem_sparse_ptr_flag +#include // cute::Swizzle +#include // cute::Tensor +#include // cute::LayoutType +#include // cute::SM90::SPARSE::GMMA_64x8x32_F16F16F16_SS, etc +#include // cute::GMMA::Layout_* +#include // cute::MMA_Traits +#include // cute::ComposedLayout +#include // cute::is_static + +namespace cute { + +namespace SM90::GMMA { + +/////////////////////////////////////////// +// Common layouts for GMMA Shared Memory // +/////////////////////////////////////////// + +// M|N-major layouts in units of Type and sparsity factor S +template +using Layout_MN_INTER_SpAtom = ComposedLayout, smem_sparse_ptr_flag_bits>, + decltype(blocked_product(Layout>>{}, Layout_MN_INTER_Atom{}.layout_b()))>; +template +using Layout_MN_SW32_SpAtom = ComposedLayout, smem_sparse_ptr_flag_bits>, + decltype(blocked_product(Layout>>{}, Layout_MN_SW32_Atom{}.layout_b()))>; +template +using Layout_MN_SW64_SpAtom = ComposedLayout, smem_sparse_ptr_flag_bits>, + decltype(blocked_product(Layout>>{}, Layout_MN_SW64_Atom{}.layout_b()))>; +template +using Layout_MN_SW128_SpAtom = ComposedLayout, smem_sparse_ptr_flag_bits>, + decltype(blocked_product(Layout>>{}, Layout_MN_SW128_Atom{}.layout_b()))>; + +// K-major layouts in units of Type and sparsity factor S +template +using Layout_K_INTER_SpAtom = ComposedLayout, smem_sparse_ptr_flag_bits>, + decltype(blocked_product(Layout>>{}, Layout_K_INTER_Atom{}.layout_b()))>; +template +using Layout_K_SW32_SpAtom = ComposedLayout, smem_sparse_ptr_flag_bits>, + decltype(blocked_product(Layout>>{}, Layout_K_SW32_Atom{}.layout_b()))>; +template +using Layout_K_SW64_SpAtom = ComposedLayout, smem_sparse_ptr_flag_bits>, + decltype(blocked_product(Layout>>{}, Layout_K_SW64_Atom{}.layout_b()))>; +template +using Layout_K_SW128_SpAtom = ComposedLayout, smem_sparse_ptr_flag_bits>, + decltype(blocked_product(Layout>>{}, Layout_K_SW128_Atom{}.layout_b()))>; + +// With GMMA::Major param +template +using Layout_INTER_SpAtom = typename conditional, + Layout_K_INTER_SpAtom>::type; +template +using Layout_SW32_SpAtom = typename conditional, + Layout_K_SW32_SpAtom>::type; +template +using Layout_SW64_SpAtom = typename conditional, + Layout_K_SW64_SpAtom>::type; +template +using Layout_SW128_SpAtom = typename conditional, + Layout_K_SW128_SpAtom>::type; + +/////////////////////////////////////////////////////////////////////////////// +// Higher level GMMA Descriptor utilities +/////////////////////////////////////////////////////////////////////////////// + +template +struct sparse_smem_desc : DescriptorIterator {}; + +} // end namespace SM90::GMMA + +// Customization point for creating a cute::GMMAsparse_smem_desc Tensor +template +struct MakeTensor> +{ + // Note that this is the exact same as cute::GMMAsmem_desc above, plus additional static checks. 
+ template + CUTE_HOST_DEVICE constexpr auto + operator()(Tensor const& smem_tensor) + { + static_assert(is_smem::value, "Expected SMEM Tensor to construct a GMMA Desc Tensor"); + static_assert(is_sparse::value, "Expected sparse value_type."); + static_assert(is_sparse_ptr::value, "Expected sparse iter."); + return make_tensor(SM90::GMMA::DescriptorIterator{SM90::GMMA::make_gmma_desc(tensor<0>(smem_tensor))}, + replace<0>(recast(smem_tensor).layout(), Layout<_1,_0>{})); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////////// MMA_TRAITS /////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +namespace SM90::GMMA { + +// Metadata layouts +using ELayout_64x64 = Layout, Shape <_32>>, + Stride, Stride<_64>>>; + +using ELayout_64x32 = Layout, Shape <_16,_2>>, + Stride, Stride<_64,_8>>>; + +using ELayout_64x16 = Layout, Shape < _8,_2>>, + Stride, Stride<_64,_8>>>; + +} // namespace SM90::GMMA + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace SM90::GMMA::SPARSE { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +CUTE_HOST_DEVICE constexpr void +mma_unpack(MMA_Traits const& traits, + Tensor & D, + Tensor const& A_zipped, + Tensor const& B, + Tensor const& C) +{ + static_assert(is_rmem_v, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem_v, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem_v, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem_v, "Expected registers in MMA_Atom::call"); + + using DRegisters = typename MMAOp::DRegisters; + using ARegisters = typename MMAOp::ARegisters; + using ERegisters = typename MMAOp::ERegisters; + using BRegisters = typename MMAOp::BRegisters; + using CRegisters = typename MMAOp::CRegisters; + + // Register value types from the MMAOp register arrays + using RegTypeD = typename remove_extent::type; + using RegTypeA = typename remove_extent::type; + using RegTypeE = typename remove_extent::type; + using RegTypeB = typename remove_extent::type; + using RegTypeC = typename remove_extent::type; + + constexpr int RegNumA = extent::value; + constexpr int RegNumE = extent::value; + constexpr int RegNumB = extent::value; + constexpr int RegNumC = extent::value; + + auto [A, E] = unzip_tensor(A_zipped); + Tensor rA = recast(A); + Tensor rE = recast(E); + Tensor rB = recast(B); + + CUTE_STATIC_ASSERT_V(size(rA) == Int{}); + CUTE_STATIC_ASSERT_V(size(rE) == Int{}); + CUTE_STATIC_ASSERT_V(size(rB) == Int{}); + + static_assert(is_same::value, "GMMA DRegisters must have void type."); + static_assert(is_same::value, "GMMA C and D value_type must match."); + static_assert(is_same::value, "GMMA C and D layouts must match."); + + Tensor rC = recast(D); // NOTE: D and C are same, so use mutable D + + CUTE_STATIC_ASSERT_V(size(rC) == Int{}); + + detail::explode(MMAOp::fma, + rA, make_int_sequence{}, + rB, make_int_sequence{}, + rC, make_int_sequence{}, + rE, make_int_sequence{}, + &(traits.accumulate_), seq<0>{}); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace SM90::SPARSE + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = 
sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = 
GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
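+// NOTE: In these sparse traits, ValTypeA = sparse_elem<2, half_t> marks A as structured-sparse
+// (sparsity factor 2, i.e. the 2:4 pattern: two logical K elements per stored value), and
+// ValTypeE = sparse_elem<8, uint8_t> carries the sparsity metadata whose per-thread placement is
+// given by ELayout. The mma_unpack overload above splits the zipped A fragment before issuing the
+// instruction:
+//
+//   auto [A, E] = unzip_tensor(A_zipped);  // values and metadata travel together until unpack
+//
+// so callers pass a single zipped A tensor rather than separate value and metadata tensors.
+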
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using 
ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = 
GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = 
sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = 
GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
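+// NOTE: The BF16 sparse traits mirror the F16 ones above (same 64xNx32 shapes, ELayout_64x32
+// metadata layout, sparse_elem<2, ...> A operand); the BF16 atoms listed here pair only with
+// float accumulators.
+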
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE 
= sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = 
GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
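Every MMA_Traits specialization in this header follows the same skeleton: it binds one sparse wgmma instruction shape to its operand value types (A carried as a 2:4-sparse sparse_elem, E as the packed metadata), the 128-thread warpgroup ThrID, and the A/E/B/C layouts the instruction expects. For orientation, one representative specialization is written out in full below; the op name (SM90::SPARSE::GMMA_64x64x32_F32BF16BF16_SS) and the exact template-parameter spelling are assumptions for illustration, not text taken from this diff.

  template <GMMA::Major tnspA, GMMA::Major tnspB,
            GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
  struct MMA_Traits<SM90::SPARSE::GMMA_64x64x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
  {
    using ValTypeD = float;
    using ValTypeA = sparse_elem<2, bfloat16_t>;  // A operand, 2:4 structured sparse
    using ValTypeE = sparse_elem<8, uint8_t>;     // packed sparsity metadata
    using ValTypeB = bfloat16_t;
    using ValTypeC = float;

    using FrgTypeA = GMMA::smem_desc<tnspA>;      // SS: both operands described in shared memory
    using FrgTypeB = GMMA::smem_desc<tnspB>;

    using Shape_MNK = Shape<_64,_64,_32>;
    using ThrID     = Layout<_128>;               // one warpgroup
    using ALayout   = GMMA::ABLayout< 64, 32>;
    using ELayout   = GMMA::ELayout_64x32;
    using BLayout   = GMMA::ABLayout< 64, 32>;
    using CLayout   = GMMA::CLayout_64x64;

    GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
  };

The RS variants differ only in dropping FrgTypeA (A is fed from registers, so ALayout switches to the corresponding GMMA::ALayout_* register layout), and each larger N simply widens BLayout and CLayout accordingly.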
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE 
= sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 
64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + 
using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = 
int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
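The sparse_elem<2, ...> A type in all of these traits encodes the 2:4 structured-sparsity contract: along the K extent of Shape_MNK only half of the logical values are physically stored, and the E operand records which positions they occupy. A small, purely illustrative helper (hypothetical, not part of this header) makes the bookkeeping concrete:

  // Hypothetical helper, for illustration only: stored vs. logical element counts
  // of an M x K sparse A block under the 2-of-4 pattern these atoms assume.
  constexpr int logical_elems(int M, int K) { return M * K; }
  constexpr int stored_elems (int M, int K) { return M * (K / 2); }  // 2 of every 4 kept

  static_assert(stored_elems(64, 64) == 64 * 32,
                "a 64x64 logical int8 A block stores 64x32 values plus metadata");

So for the K=64 int8 atoms above, a 64x64 logical A block carries 64x32 stored bytes; the rest of the information lives in the metadata tensor typed by ValTypeE.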
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = 
Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
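These traits are what let CuTe wrap a sparse GMMA instruction in the usual MMA_Atom / TiledMMA machinery. A minimal sketch, assuming a sparse op named SM90::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN (the name is an assumption for illustration; use whichever struct this header actually specializes MMA_Traits for):

  #include <cute/atom/mma_atom.hpp>   // cute::MMA_Atom, cute::make_tiled_mma

  using namespace cute;

  // Hypothetical op name; substitute the real sparse GMMA struct from this header.
  using SparseOp   = SM90::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN;
  using SparseAtom = MMA_Atom<SparseOp>;

  auto make_sparse_tiled_mma() {
    // One warpgroup per atom; no additional tiling of atoms across M/N/K.
    return make_tiled_mma(SparseAtom{}, Layout<Shape<_1,_1,_1>>{});
  }

From there the TiledMMA is partitioned and driven much like its dense counterpart, with the extra E (metadata) operand plumbed through by the sparse collective mainloop.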
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + 
using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = 
GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = 
GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, 
int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = 
GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = 
sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = 
GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
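Reading these specializations, `ValTypeA = sparse_elem<2, ...>` marks the A operand as 2:4 structured-sparse (each stored element stands for two logical K elements) and `ValTypeE = sparse_elem<8, uint8_t>` packs the selection metadata at one byte per eight logical K elements. That is my reading of the `sparse_elem` ratios, not something stated in the patch; the toy calculation below (illustrative only, not part of the patch) just spells out what those ratios imply for the 64x*x64 atoms above.

```cpp
// Illustrative arithmetic only -- not part of the patch. Assumes sparse_elem<S, T>
// means one stored T per S logical elements, as suggested by the traits above.
#include <cstdio>

int main() {
  constexpr int K            = 64;  // logical K extent of the atom (from Shape_MNK)
  constexpr int kSparsityA   = 2;   // from ValTypeA = sparse_elem<2, ...>
  constexpr int kElemsPerMdB = 8;   // from ValTypeE = sparse_elem<8, uint8_t>

  constexpr int stored_A_per_row   = K / kSparsityA;    // 32 stored A values per row
  constexpr int metadata_B_per_row = K / kElemsPerMdB;  // 8 metadata bytes per row

  std::printf("stored A elems per row: %d, metadata bytes per row: %d\n",
              stored_A_per_row, metadata_B_per_row);
  return 0;
}
```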
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = 
Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using 
FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = 
GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + 
GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + 
using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + 
using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + 
using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using 
BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
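These traits only advertise static metadata (value types, `Shape_MNK`, `ThrID`, and the A/B/C/E layouts); they are consumed through CuTe's generic `MMA_Atom` / `TiledMMA` machinery rather than used directly. The sketch below is illustrative only and not part of the patch: `SparseGmmaOp` is a hypothetical placeholder for one of the SM90 sparse GMMA ops whose traits are specialized above, with the concrete op type coming from the corresponding arch-level header.

```cpp
// Illustrative sketch only -- not part of the patch. "SparseGmmaOp" is a
// placeholder for one of the SM90 sparse GMMA ops whose MMA_Traits are
// specialized above; build with the CUTLASS/CuTe include path.
#include <cute/tensor.hpp>
#include <cute/atom/mma_atom.hpp>

template <class SparseGmmaOp>
void inspect_sparse_mma_traits()
{
  using namespace cute;

  using Traits = MMA_Traits<SparseGmmaOp>;

  // Static metadata advertised by the specializations above.
  using ShapeMNK = typename Traits::Shape_MNK;  // e.g. Shape<_64,_64,_64>
  using ThrID    = typename Traits::ThrID;      // one warpgroup: Layout<_128>

  // Wrap the op in an MMA_Atom and tile it the same way as a dense GMMA op.
  auto tiled_mma = make_tiled_mma(MMA_Atom<SparseGmmaOp>{},
                                  Layout<Shape<_1,_1,_1>>{});

  print(ShapeMNK{});      print("\n");
  print(ThrID{});         print("\n");
  print(size(tiled_mma)); print("\n");  // number of participating threads
}
```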
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = 
sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using 
ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, 
float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; 
+ using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut 
accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, 
float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + 
using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + 
GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using 
ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = 
GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; 
+ using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = 
GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD 
= half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 8, 64>; + using CLayout = GMMA::CLayout_64x8; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + 
using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 16, 64>; + using CLayout = GMMA::CLayout_64x16; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 32, 64>; + using CLayout = GMMA::CLayout_64x32; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using 
ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 64, 64>; + using CLayout = GMMA::CLayout_64x64; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 96, 64>; + using CLayout = GMMA::CLayout_64x96; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = 
GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<128, 64>; + using CLayout = GMMA::CLayout_64x128; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<192, 64>; + using CLayout = GMMA::CLayout_64x192; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
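// ------------------------------------------------------------------------------------------------
// Explanatory sketch (editor's note, not part of the generated traits): every specialization in
// this header follows the same recipe for binding one SM90 sparse GMMA instruction to the
// metadata that CuTe's MMA_Atom machinery expects.
//   * ValTypeA = sparse_elem<2, T>       -- A is 2:4 structured-sparse, so each stored element of
//                                           A stands for two logical K-elements.
//   * ValTypeE = sparse_elem<8, uint8_t> -- the sparsity metadata, one byte covering eight
//                                           logical elements of A.
//   * FrgTypeA / FrgTypeB = GMMA::smem_desc -- the operand is consumed through a shared-memory
//     descriptor ("SS" flavor); specializations that omit FrgTypeA source A from registers ("RS").
//   * Shape_MNK and ThrID = Layout<_128> -- the instruction tile and the single 128-thread
//                                           warpgroup that executes it.
//   * ALayout / ELayout / BLayout / CLayout -- thread/value-to-tile mappings used when
//                                              partitioning tensors over the tile.
//   * accumulate_ = GMMA::ScaleOut::One  -- accumulate into the C/D accumulator;
//                                           GMMA::ScaleOut::Zero overwrites it instead.
//
// Minimal, hypothetical usage sketch: `SomeSparseGmmaOp` below is a stand-in for any op type
// specialized in this header (the concrete op names are defined in the arch headers and are not
// reproduced here).
//
//   using TiledMma = decltype(make_tiled_mma(MMA_Atom<SomeSparseGmmaOp>{}));
//   // Partition the compressed A, the metadata E, B, and the accumulators with TiledMma, then
//   // drive the mainloop with cute::gemm(tiled_mma, ...) just as for the dense GMMA atoms.
// ------------------------------------------------------------------------------------------------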
+template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<256, 64>; + using CLayout = GMMA::CLayout_64x256; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute + +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) +#include "mma_traits_sm90_gmma_sparse_ext.hpp" +#endif diff --git a/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp b/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp new file mode 100644 index 0000000000..3680b7e13f --- /dev/null +++ b/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp @@ -0,0 +1,17335 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include +#include + +namespace cute { + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using 
ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = 
Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
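// A reading of the recurring structure of these sparse MMA_Traits specializations
// (hedged; the exact GMMA op tags that parameterize MMA_Traits<> and the template
// parameter lists are defined in the cute/arch headers and are not reproduced here):
//  - ValTypeA = sparse_elem<2, T> marks operand A as structured-sparse, presumably
//    2:4: each stored value stands for two logical K elements.
//  - ValTypeE = sparse_elem<8, uint8_t> is the sparsity metadata; one byte covers
//    eight logical K elements, and ELayout (e.g. GMMA::ELayout_64x32) describes how
//    that metadata is distributed across the warpgroup.
//  - Shape_MNK is the logical (dense) MxNxK tile of one GMMA instruction, and
//    ThrID = Layout<_128> reflects that a full 128-thread warpgroup issues it.
//  - FrgTypeA / FrgTypeB = GMMA::smem_desc indicate operands consumed directly from
//    shared memory through descriptors; accumulate_ selects whether the instruction
//    accumulates into C (GMMA::ScaleOut::One) or overwrites it (GMMA::ScaleOut::Zero).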
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, 
uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = 
GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct 
MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = 
Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using 
ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = 
GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using 
ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + 
using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using 
ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = 
GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using 
ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout 
= GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, half_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = half_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 24, 32>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 40, 32>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = 
bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 48, 32>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 56, 32>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + 
using BLayout = GMMA::ABLayout< 72, 32>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 80, 32>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout< 88, 32>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<104, 32>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<112, 32>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<120, 32>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + 
using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<136, 32>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<144, 32>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using 
ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<152, 32>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<160, 32>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<168, 32>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<176, 32>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<184, 32>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<200, 32>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + 
using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<208, 32>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<216, 32>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using 
ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<224, 32>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<232, 32>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<240, 32>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, bfloat16_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = bfloat16_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using ELayout = GMMA::ELayout_64x32; + using BLayout = GMMA::ABLayout<248, 32>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 24, 16>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 24, 16>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 40, 16>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 40, 16>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using 
ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 48, 16>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 48, 16>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 56, 16>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 56, 16>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 72, 16>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = 
GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 72, 16>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 80, 16>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 80, 16>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 88, 16>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout< 88, 16>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<104, 16>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<104, 16>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<112, 16>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<112, 16>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<120, 16>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<120, 16>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + 
using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<136, 16>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<136, 16>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<144, 16>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<144, 16>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<152, 16>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using 
ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<152, 16>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<160, 16>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<160, 16>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<168, 16>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<168, 16>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<176, 16>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<176, 16>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<184, 16>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<184, 16>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<200, 16>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<200, 16>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + 
using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<208, 16>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<208, 16>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<216, 16>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<216, 16>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<224, 16>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using 
ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<224, 16>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<232, 16>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<232, 16>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<240, 16>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<240, 16>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<248, 16>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, tfloat32_t>; + using ValTypeE = sparse_elem<4, uint8_t>; + using ValTypeB = tfloat32_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using ELayout = GMMA::ELayout_64x16; + using BLayout = GMMA::ABLayout<248, 16>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, 
int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB 
= GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = 
GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; 
+}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; 
+ +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = 
int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + 
+ using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using 
ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, int8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + 
using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout 
= GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD 
= int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; 
+ + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + 
using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = int8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = 
uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + 
using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 
64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = 
uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using 
FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = int32_t; + using ValTypeA = sparse_elem<2, uint8_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = uint8_t; + using ValTypeC = int32_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, 
float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using 
ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
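The sparse FP8 `MMA_Traits` entries in this hunk (and in the hunks that follow) lost their template headers and the `MMA_Traits<...>` specialization arguments during extraction, which is why every entry now reads `template struct MMA_Traits> { ... };`. As a reading aid, below is a minimal sketch of what one complete pair of entries of this shape looks like, reconstructed only from the member aliases that did survive (`Shape_MNK`, the `ValType*` aliases, and the `*Layout` aliases). The qualified op names (`SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_SS_TN` / `..._RS_TN`), the empty `template <>` headers, and the `GMMA::Major::K` argument to `GMMA::smem_desc` are assumptions made for illustration, not text recovered from the diff.

```cpp
// Illustrative reconstruction only: the specialization arguments and template
// headers are assumptions; every member alias is copied from the hunk above.
namespace cute {

// SS flavor: A and B both come from shared memory, so the traits expose a
// shared-memory descriptor fragment type for each operand.
template <>
struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_SS_TN>  // assumed op name
{
  using ValTypeD = half_t;
  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // 2:4 structured-sparse A values
  using ValTypeE = sparse_elem<8, uint8_t>;       // sparsity metadata (E operand)
  using ValTypeB = float_e4m3_t;
  using ValTypeC = half_t;

  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // Major::K assumed (FP8 GMMA is TN-only)
  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;

  using Shape_MNK = Shape<_64,_24,_64>;
  using ThrID     = Layout<_128>;
  using ALayout   = GMMA::ABLayout< 64, 64>;
  using ELayout   = GMMA::ELayout_64x64;
  using BLayout   = GMMA::ABLayout< 24, 64>;
  using CLayout   = GMMA::CLayout_64x24;

  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
};

// RS flavor: A is fed from the register file, so FrgTypeA is omitted and
// ALayout uses ALayout_64x64 instead of the shared-memory ABLayout.
template <>
struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_RS_TN>  // assumed op name
{
  using ValTypeD = half_t;
  using ValTypeA = sparse_elem<2, float_e4m3_t>;
  using ValTypeE = sparse_elem<8, uint8_t>;
  using ValTypeB = float_e4m3_t;
  using ValTypeC = half_t;

  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;

  using Shape_MNK = Shape<_64,_24,_64>;
  using ThrID     = Layout<_128>;
  using ALayout   = GMMA::ALayout_64x64;
  using ELayout   = GMMA::ELayout_64x64;
  using BLayout   = GMMA::ABLayout< 24, 64>;
  using CLayout   = GMMA::CLayout_64x24;

  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
};

} // namespace cute
```

Every remaining entry in this section repeats one of these two shapes, varying only the N extent (24 through 248, which drives `Shape_MNK`, `BLayout`, and `CLayout`), the accumulator type (`half_t` vs `float`), and, toward the end of the section, the B operand type (`float_e4m3_t` vs `float_e5m2_t`).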
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, 
float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using 
ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + 
using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + 
GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ 
+ using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB 
= GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = 
GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
+template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = 
GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = 
GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = 
sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; 
+ using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = 
GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + 
using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = 
Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut 
accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, 
float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; 
+ using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + 
+ GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using 
ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using 
Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using 
CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + 
using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB 
= GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = 
GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
+template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using 
ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = 
GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = 
sparse_elem<2, float_e4m3_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = 
Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut 
accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + 
using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK 
= Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut 
accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = 
float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; 
+ + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = 
GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template 
+struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = 
GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = 
GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = 
sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = 
Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut 
accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = 
sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = 
Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = 
GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e4m3_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_24,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 24, 64>; + using CLayout = GMMA::CLayout_64x24; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = 
half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_40,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 40, 64>; + using CLayout = GMMA::CLayout_64x40; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + 
using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_48,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 48, 64>; + using CLayout = GMMA::CLayout_64x48; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout 
= GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_56,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 56, 64>; + using CLayout = GMMA::CLayout_64x56; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_72,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 72, 64>; + using CLayout = GMMA::CLayout_64x72; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = 
half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_80,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 80, 64>; + using CLayout = GMMA::CLayout_64x80; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + 
using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_88,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout< 88, 64>; + using CLayout = GMMA::CLayout_64x88; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using 
CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_104,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<104, 64>; + using CLayout = GMMA::CLayout_64x104; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_112,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<112, 64>; + using CLayout = GMMA::CLayout_64x112; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + 
using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_120,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<120, 64>; + using CLayout = GMMA::CLayout_64x120; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using 
FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_136,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<136, 64>; + using CLayout = GMMA::CLayout_64x136; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout 
= GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_144,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<144, 64>; + using CLayout = GMMA::CLayout_64x144; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_152,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<152, 64>; + using CLayout = GMMA::CLayout_64x152; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
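// Annotation (not part of the patch): the specializations in this hunk all follow one pattern.
// Each describes a sparse Hopper GMMA tile with Shape_MNK = 64 x N x 64, where only N varies:
//   * ValTypeA = sparse_elem<2, T>  -- operand A is 2:4 structured-sparse, so each stored
//     value of type T stands for two logical K-elements.
//   * ValTypeE = sparse_elem<8, uint8_t> -- E is the sparsity metadata; one metadata byte
//     covers eight logical elements of A.
//   * A specialization that defines FrgTypeA = GMMA::smem_desc<...> is an SS atom (A is read
//     from shared memory via a matrix descriptor); its sibling without FrgTypeA exposes
//     ALayout = GMMA::ALayout_64x64 instead and sources A from registers (RS atom).
//   * accumulate_ = GMMA::ScaleOut::One selects D = A*B + C; GMMA::ScaleOut::Zero would
//     compute D = A*B only.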
+template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_160,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<160, 64>; + using CLayout = GMMA::CLayout_64x160; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using 
ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_168,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<168, 64>; + using CLayout = GMMA::CLayout_64x168; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = 
GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_176,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<176, 64>; + using CLayout = GMMA::CLayout_64x176; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_184,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<184, 64>; + using CLayout = GMMA::CLayout_64x184; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + 
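// Illustrative sketch (an assumption-laden aside, not code from this patch): any op that gains
// an MMA_Traits specialization like the ones above can be wrapped into a CuTe TiledMMA in the
// usual way. `SparseGmmaOp` and `make_warpgroup_tiled_mma` below are hypothetical names used
// only for illustration; substitute the sparse GMMA op matching the desired shape and dtypes.

#include <cute/tensor.hpp>
#include <cute/atom/mma_atom.hpp>

// Build a single-warpgroup TiledMMA from a (sparse) GMMA op that has an MMA_Traits specialization.
template <class SparseGmmaOp>
auto make_warpgroup_tiled_mma()
{
  using Traits = cute::MMA_Traits<SparseGmmaOp>;   // e.g. one of the specializations above
  // GMMA atoms are warpgroup-wide: ThrID is Layout<_128>.
  static_assert(cute::size(typename Traits::ThrID{}) == 128, "expected a warpgroup-wide atom");
  // A 1x1x1 atom layout tiles a single atom, so the TiledMMA's MNK tile equals
  // Traits::Shape_MNK (here 64 x N x 64, with N fixed by the chosen op).
  return cute::make_tiled_mma(cute::MMA_Atom<SparseGmmaOp>{},
                              cute::Layout<cute::Shape<cute::_1, cute::_1, cute::_1>>{});
}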
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_200,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<200, 64>; + using CLayout = GMMA::CLayout_64x200; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = 
sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_208,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<208, 64>; + using CLayout = GMMA::CLayout_64x208; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = 
Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_216,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<216, 64>; + using CLayout = GMMA::CLayout_64x216; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_224,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<224, 64>; + using CLayout = GMMA::CLayout_64x224; + + GMMA::ScaleOut 
accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_232,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<232, 64>; + using CLayout = GMMA::CLayout_64x232; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using 
ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_240,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<240, 64>; + using CLayout = GMMA::CLayout_64x240; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = half_t; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = half_t; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeA = GMMA::smem_desc; + using FrgTypeB = 
GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 64>; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ValTypeD = float; + using ValTypeA = sparse_elem<2, float_e5m2_t>; + using ValTypeE = sparse_elem<8, uint8_t>; + using ValTypeB = float_e5m2_t; + using ValTypeC = float; + + using FrgTypeB = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_248,_64>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x64; + using ELayout = GMMA::ELayout_64x64; + using BLayout = GMMA::ABLayout<248, 64>; + using CLayout = GMMA::CLayout_64x248; + + GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute diff --git a/include/cute/config.hpp b/include/cute/config.hpp new file mode 100644 index 0000000000..792eee90f0 --- /dev/null +++ b/include/cute/config.hpp @@ -0,0 +1,149 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +# define CUTE_HOST_DEVICE __forceinline__ __host__ __device__ +# define CUTE_DEVICE __forceinline__ __device__ +# define CUTE_HOST __forceinline__ __host__ +#else +# define CUTE_HOST_DEVICE inline +# define CUTE_DEVICE inline +# define CUTE_HOST inline +#endif // CUTE_HOST_DEVICE, CUTE_DEVICE + +#if defined(__CUDACC_RTC__) +# define CUTE_HOST_RTC CUTE_HOST_DEVICE +#else +# define CUTE_HOST_RTC CUTE_HOST +#endif + +#if !defined(__CUDACC_RTC__) && !defined(__clang__) && \ + (defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)) +# define CUTE_UNROLL #pragma unroll +# define CUTE_NO_UNROLL #pragma unroll 1 +#elif defined(__CUDACC_RTC__) || defined(__clang__) +# define CUTE_UNROLL _Pragma("unroll") +# define CUTE_NO_UNROLL _Pragma("unroll 1") +#else +# define CUTE_UNROLL +# define CUTE_NO_UNROLL +#endif // CUTE_UNROLL + +#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) +# define CUTE_INLINE_CONSTANT static const __device__ +#else +# define CUTE_INLINE_CONSTANT static constexpr +#endif + +// __grid_constant__ was introduced in CUDA 11.7. +#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 7))) +# define CUTE_GRID_CONSTANT_SUPPORTED +#endif + +// __grid_constant__ can be enabled only on SM70+. +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)) +# define CUTE_GRID_CONSTANT_ENABLED +#endif + +#if ! defined(CUTE_GRID_CONSTANT) +# if defined(CUTE_GRID_CONSTANT_SUPPORTED) && defined(CUTE_GRID_CONSTANT_ENABLED) +# define CUTE_GRID_CONSTANT __grid_constant__ +# else +# define CUTE_GRID_CONSTANT +# endif +#endif + +// Some versions of GCC < 11 have trouble deducing that a +// function with "auto" return type and all of its returns in an "if +// constexpr ... else" statement must actually return. Thus, GCC +// emits spurious "missing return statement" build warnings. +// Developers can suppress these warnings by using the +// CUTE_GCC_UNREACHABLE macro, which must be followed by a semicolon. +// It's harmless to use the macro for other GCC versions or other +// compilers, but it has no effect. +#if ! defined(CUTE_GCC_UNREACHABLE) +# if defined(__GNUC__) +# define CUTE_GCC_UNREACHABLE __builtin_unreachable() +# else +# define CUTE_GCC_UNREACHABLE +# endif +#endif + +#if defined(_MSC_VER) +// Provides support for alternative operators 'and', 'or', and 'not' +# include +#endif // _MSC_VER + +#if defined(__CUDACC_RTC__) +# define CUTE_STL_NAMESPACE cuda::std +# define CUTE_STL_NAMESPACE_IS_CUDA_STD +#else +# define CUTE_STL_NAMESPACE std +#endif + +// +// Assertion helpers +// + +#if defined(__CUDACC_RTC__) +# include +#else +# include +#endif + +#define CUTE_STATIC_V(x) decltype(x)::value + +#define CUTE_STATIC_ASSERT static_assert +#define CUTE_STATIC_ASSERT_V(x,...) static_assert(decltype(x)::value, ##__VA_ARGS__) + +// Fail and print a message. Typically used for notification of a compiler misconfiguration. 
+#if defined(__CUDA_ARCH__) +# define CUTE_INVALID_CONTROL_PATH(x) assert(0 && x); printf(x); __brkpt() +#else +# define CUTE_INVALID_CONTROL_PATH(x) assert(0 && x); printf(x) +#endif + +// +// IO +// + +#if !defined(__CUDACC_RTC__) +# include +# include +# include +#endif + +// +// Debugging utilities +// + +#include diff --git a/include/cute/container/alignment.hpp b/include/cute/container/alignment.hpp new file mode 100644 index 0000000000..52e4cbadd9 --- /dev/null +++ b/include/cute/container/alignment.hpp @@ -0,0 +1,70 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +// Test if a pointer is aligned to N bytes +template +CUTE_HOST_DEVICE constexpr +bool +is_byte_aligned(void const* const ptr) +{ + static_assert(has_single_bit(N), "N must be a power of 2 in alignment check"); + return (reinterpret_cast(ptr) & (N-1)) == 0; +} + +#if defined(__CUDACC__) +# define CUTE_ALIGNAS(n) __align__(n) +#else +# define CUTE_ALIGNAS(n) alignas(n) +#endif + +template +struct aligned_struct {}; + +template struct CUTE_ALIGNAS( 1) aligned_struct< 1, Child> {}; +template struct CUTE_ALIGNAS( 2) aligned_struct< 2, Child> {}; +template struct CUTE_ALIGNAS( 4) aligned_struct< 4, Child> {}; +template struct CUTE_ALIGNAS( 8) aligned_struct< 8, Child> {}; +template struct CUTE_ALIGNAS( 16) aligned_struct< 16, Child> {}; +template struct CUTE_ALIGNAS( 32) aligned_struct< 32, Child> {}; +template struct CUTE_ALIGNAS( 64) aligned_struct< 64, Child> {}; +template struct CUTE_ALIGNAS(128) aligned_struct<128, Child> {}; +template struct CUTE_ALIGNAS(256) aligned_struct<256, Child> {}; + +} // end namespace cute diff --git a/include/cute/container/array.hpp b/include/cute/container/array.hpp new file mode 100644 index 0000000000..9cdcf5f4c2 --- /dev/null +++ b/include/cute/container/array.hpp @@ -0,0 +1,492 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +template +struct array +{ + using element_type = T; + using value_type = remove_cv_t; + using size_type = size_t; + using difference_type = ptrdiff_t; + using reference = element_type&; + using const_reference = const element_type&; + using pointer = element_type*; + using const_pointer = const element_type*; + using iterator = pointer; + using const_iterator = const_pointer; + + CUTE_HOST_DEVICE constexpr + reference operator[](size_type pos) + { + return begin()[pos]; + } + + CUTE_HOST_DEVICE constexpr + const_reference operator[](size_type pos) const + { + return begin()[pos]; + } + + CUTE_HOST_DEVICE constexpr + reference front() + { + return *begin(); + } + + CUTE_HOST_DEVICE constexpr + const_reference front() const + { + return *begin(); + } + + CUTE_HOST_DEVICE constexpr + reference back() + { + // return *rbegin(); + return operator[](N-1); + } + + CUTE_HOST_DEVICE constexpr + const_reference back() const + { + // return *rbegin(); + return operator[](N-1); + } + + CUTE_HOST_DEVICE constexpr + T* data() + { + return __elems_; + } + + CUTE_HOST_DEVICE constexpr + T const* data() const + { + return __elems_; + } + + CUTE_HOST_DEVICE constexpr + iterator begin() + { + return data(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator begin() const + { + return data(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cbegin() + { + return begin(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cbegin() const + { + return begin(); + } + + CUTE_HOST_DEVICE constexpr + iterator end() + { + return data() + size(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator end() const + { + return data() + size(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cend() + { + return end(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cend() const + { + return end(); + } + + CUTE_HOST_DEVICE constexpr + bool empty() const + { + return size() == 0; + } + + CUTE_HOST_DEVICE constexpr + size_type size() const + { + return N; + } + + CUTE_HOST_DEVICE constexpr + size_type max_size() const + { + return size(); + } + + CUTE_HOST_DEVICE constexpr + void fill(const T& value) + { + for (auto& e : *this) { + e = value; + } + } + + CUTE_HOST_DEVICE constexpr + void clear() + { + fill(T(0)); + } + + CUTE_HOST_DEVICE constexpr + void swap(array& other) + { + using CUTE_STL_NAMESPACE::swap; + for (size_type i = 0; i < size(); ++i) { + swap((*this)[i], other[i]); + } + } + + element_type __elems_[N]; +}; + + +template +struct array +{ + using element_type = T; + using value_type = remove_cv_t; + using size_type = size_t; + using difference_type = ptrdiff_t; + using reference = element_type&; + using const_reference = const element_type&; + using pointer = element_type*; + using const_pointer = const element_type*; + using const_iterator = const_pointer; + using iterator = pointer; + + CUTE_HOST_DEVICE constexpr + reference operator[](size_type pos) + { + return begin()[pos]; + } + + CUTE_HOST_DEVICE constexpr + const_reference operator[](size_type pos) const + { + return begin()[pos]; + } + + CUTE_HOST_DEVICE constexpr + reference front() + { + return *begin(); + } + + CUTE_HOST_DEVICE constexpr + const_reference front() const + { + return *begin(); + } + + CUTE_HOST_DEVICE constexpr + reference back() + { + return *begin(); + } + + CUTE_HOST_DEVICE constexpr + const_reference back() const + { + return *begin(); + } + + 
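+  // Zero-length specialization: the accessors below deliberately return
+  // nullptr / 0, and fill/clear/swap are no-ops, so generic code can treat
+  // an empty array uniformly without special-casing N == 0.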
CUTE_HOST_DEVICE constexpr + T* data() + { + return nullptr; + } + + CUTE_HOST_DEVICE constexpr + T const* data() const + { + return nullptr; + } + + CUTE_HOST_DEVICE constexpr + iterator begin() + { + return nullptr; + } + + CUTE_HOST_DEVICE constexpr + const_iterator begin() const + { + return nullptr; + } + + CUTE_HOST_DEVICE constexpr + const_iterator cbegin() + { + return nullptr; + } + + CUTE_HOST_DEVICE constexpr + const_iterator cbegin() const + { + return nullptr; + } + + CUTE_HOST_DEVICE constexpr + iterator end() + { + return nullptr; + } + + CUTE_HOST_DEVICE constexpr + const_iterator end() const + { + return nullptr; + } + + CUTE_HOST_DEVICE constexpr + const_iterator cend() + { + return nullptr; + } + + CUTE_HOST_DEVICE constexpr + const_iterator cend() const + { + return nullptr; + } + + CUTE_HOST_DEVICE constexpr + bool empty() const + { + return true; + } + + CUTE_HOST_DEVICE constexpr + size_type size() const + { + return 0; + } + + CUTE_HOST_DEVICE constexpr + size_type max_size() const + { + return 0; + } + + CUTE_HOST_DEVICE constexpr + void fill(const T& value) + {} + + CUTE_HOST_DEVICE constexpr + void clear() + {} + + CUTE_HOST_DEVICE constexpr + void swap(array& other) + {} +}; + +template +CUTE_HOST_DEVICE constexpr +bool operator==(array const& lhs, array const& rhs) +{ + for (size_t i = 0; i < N; ++i) { + if (lhs[i] != rhs[i]) { + return false; + } + } + return true; +} + +template +CUTE_HOST_DEVICE constexpr +void clear(array& a) +{ + a.fill(T(0)); +} + +template +CUTE_HOST_DEVICE constexpr +void fill(array& a, T const& value) +{ + a.fill(value); +} + +template +CUTE_HOST_DEVICE constexpr +void swap(array& a, array& b) +{ + a.swap(b); +} + +/// @return A cute::array of the elements of @c t in reverse order. +template +CUTE_HOST_DEVICE constexpr +cute::array reverse(cute::array const& t) +{ + if constexpr (N == 0u) { + return t; + } else { + cute::array t_r{}; + for (size_t k = 0; k < N; ++k) { + t_r[k] = t[N - k - 1]; + } + return t_r; + } +} + +} // end cute + + +// +// Specialize tuple-related functionality for cute::array +// + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +namespace cute +{ + +template +CUTE_HOST_DEVICE constexpr +T& get(array& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +T const& get(array const& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +T&& get(array&& a) +{ + static_assert(I < N, "Index out of range"); + return cute::move(a[I]); +} + +} // end namespace cute + +namespace CUTE_STL_NAMESPACE +{ + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> +{ + using type = T; +}; + +template +struct tuple_size const> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element const> +{ + using type = T; +}; + +} // end namespace CUTE_STL_NAMESPACE + +#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD +namespace std +{ + +#if defined(__CUDACC_RTC__) +template +struct tuple_size; + +template +struct tuple_element; +#endif + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> +{ + using type = T; +}; + +template +struct tuple_size const> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element const> +{ + using type = T; +}; + +} // end namespace std +#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD diff --git a/include/cute/container/array_aligned.hpp 
b/include/cute/container/array_aligned.hpp new file mode 100644 index 0000000000..a9d14a1a25 --- /dev/null +++ b/include/cute/container/array_aligned.hpp @@ -0,0 +1,42 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_ALIGNAS +#include // cute::array + +namespace cute +{ + +template +struct CUTE_ALIGNAS(Alignment) array_aligned : cute::array {}; + +} // end namespace cute diff --git a/include/cute/container/array_subbyte.hpp b/include/cute/container/array_subbyte.hpp new file mode 100644 index 0000000000..48d416f45b --- /dev/null +++ b/include/cute/container/array_subbyte.hpp @@ -0,0 +1,662 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Statically sized array of elements that accommodates subbyte trivial types + in a packed storage. +*/ + +#pragma once + +#include + +#include +#include + +namespace cute +{ +// +// Underlying subbyte storage type +// +template +using subbyte_storage_type_t = conditional_t<(cute::sizeof_bits_v <= 8), uint8_t, + conditional_t<(cute::sizeof_bits_v <= 16), uint16_t, + conditional_t<(cute::sizeof_bits_v <= 32), uint32_t, + conditional_t<(cute::sizeof_bits_v <= 64), uint64_t, + conditional_t<(cute::sizeof_bits_v <= 128), uint128_t, + T>>>>>; + +template struct subbyte_iterator; +template struct swizzle_ptr; + +// +// subbyte_reference +// Proxy object for sub-byte element references +// +template +struct subbyte_reference +{ + // Iterator Element type (const or non-const) + using element_type = T; + // Iterator Value type without type qualifier. + using value_type = remove_cv_t; + // Storage type (const or non-const) + using storage_type = conditional_t<(is_const_v), subbyte_storage_type_t const, subbyte_storage_type_t>; + + static_assert(sizeof_bits_v % 8 == 0, "Storage type is not supported"); + + static_assert(sizeof_bits_v <= sizeof_bits_v, + "Size of Element must not be greater than Storage."); + +private: + + // Bitmask for covering one item + static constexpr storage_type BitMask = storage_type(storage_type(-1) >> (sizeof_bits_v - sizeof_bits_v)); + // Flag for fast branching on straddled elements + static constexpr bool is_storage_unaligned = ((sizeof_bits_v % sizeof_bits_v) != 0); + + friend struct subbyte_iterator; + + // Pointer to storage element + storage_type* ptr_ = nullptr; + + // Bit index of value_type starting position within storage_type element. 
+ // RI: 0 <= idx_ < sizeof_bit + uint8_t idx_ = 0; + + // Ctor + template + CUTE_HOST_DEVICE constexpr + subbyte_reference(PointerType* ptr, uint8_t idx = 0) : ptr_(reinterpret_cast(ptr)), idx_(idx) {} + +public: + + // Copy Ctor + CUTE_HOST_DEVICE constexpr + subbyte_reference(subbyte_reference const& other) { + *this = other.get(); + } + + CUTE_HOST_DEVICE constexpr + subbyte_reference(subbyte_reference const& other) { + *this = other.get(); + } + + // Copy Assignment + CUTE_HOST_DEVICE constexpr + subbyte_reference& operator=(subbyte_reference const& other) { + return *this = other.get(); + } + + CUTE_HOST_DEVICE constexpr + subbyte_reference& operator=(subbyte_reference const& other) { + return *this = other.get(); + } + + // Assignment + template + CUTE_HOST_DEVICE constexpr + enable_if_t, subbyte_reference&> operator=(value_type x) + { + static_assert(is_same_v, "Do not specify template arguments!"); + storage_type item = (reinterpret_cast(x) & BitMask); + + // Update the current storage element + storage_type bit_mask_0 = storage_type(BitMask << idx_); + ptr_[0] = storage_type((ptr_[0] & ~bit_mask_0) | (item << idx_)); + + // If value_type is unaligned with storage_type (static) and this is a straddled value (dynamic) + if (is_storage_unaligned && idx_ + sizeof_bits_v > sizeof_bits_v) { + uint8_t straddle_bits = uint8_t(sizeof_bits_v - idx_); + storage_type bit_mask_1 = storage_type(BitMask >> straddle_bits); + // Update the next storage element + ptr_[1] = storage_type((ptr_[1] & ~bit_mask_1) | (item >> straddle_bits)); + } + + return *this; + } + + // Comparison of referenced values + CUTE_HOST_DEVICE constexpr friend + bool operator==(subbyte_reference const& x, subbyte_reference const& y) { return x.get() == y.get(); } + CUTE_HOST_DEVICE constexpr friend + bool operator!=(subbyte_reference const& x, subbyte_reference const& y) { return x.get() != y.get(); } + CUTE_HOST_DEVICE constexpr friend + bool operator< (subbyte_reference const& x, subbyte_reference const& y) { return x.get() < y.get(); } + CUTE_HOST_DEVICE constexpr friend + bool operator> (subbyte_reference const& x, subbyte_reference const& y) { return x.get() > y.get(); } + CUTE_HOST_DEVICE constexpr friend + bool operator<=(subbyte_reference const& x, subbyte_reference const& y) { return x.get() <= y.get(); } + CUTE_HOST_DEVICE constexpr friend + bool operator>=(subbyte_reference const& x, subbyte_reference const& y) { return x.get() >= y.get(); } + + // Value + CUTE_HOST_DEVICE + value_type get() const + { + if constexpr (is_same_v) { // Extract to bool -- potentially faster impl + return bool((*ptr_) & (BitMask << idx_)); + } else { // Extract to value_type + // Extract from the current storage element + auto item = storage_type((ptr_[0] >> idx_) & BitMask); + + // If value_type is unaligned with storage_type (static) and this is a straddled value (dynamic) + if (is_storage_unaligned && idx_ + sizeof_bits_v > sizeof_bits_v) { + uint8_t straddle_bits = uint8_t(sizeof_bits_v - idx_); + storage_type bit_mask_1 = storage_type(BitMask >> straddle_bits); + // Extract from the next storage element + item |= storage_type((ptr_[1] & bit_mask_1) << straddle_bits); + } + + return reinterpret_cast(item); + } + } + + // Extract to type value_type + CUTE_HOST_DEVICE constexpr + operator value_type() const { + return get(); + } + + // Address + CUTE_HOST_DEVICE + subbyte_iterator operator&() const { + return {ptr_, idx_}; + } +}; + +template +CUTE_HOST_DEVICE +void +print(subbyte_reference ref) { + cute::print(ref.get()); +} + 
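For readers skimming the diff, the following is a minimal standalone sketch (not CUTLASS code) of the pack/extract arithmetic that `subbyte_reference::get()` and its assignment operator perform when a sub-byte value straddles two storage words. The 6-bit element width, the `uint8_t` storage type, and the helper names `extract`/`insert` are illustrative choices, not part of the library; the same masks appear in the real implementation as `BitMask`, `bit_mask_0`, and `bit_mask_1`.

```cpp
// Standalone sketch of the straddled read/write path used by subbyte_reference.
// 6-bit values packed into uint8_t storage, so some values cross a byte boundary.
#include <cassert>
#include <cstdint>

constexpr unsigned value_bits   = 6;
constexpr unsigned storage_bits = 8;
constexpr uint8_t  bit_mask     = uint8_t((1u << value_bits) - 1);   // covers one item

uint8_t extract(uint8_t const* ptr, unsigned idx /* bit offset within ptr[0] */) {
  uint8_t item = uint8_t((ptr[0] >> idx) & bit_mask);                // low part
  if (idx + value_bits > storage_bits) {                             // straddled value?
    unsigned straddle_bits = storage_bits - idx;                     // bits taken from ptr[0]
    uint8_t  mask_hi       = uint8_t(bit_mask >> straddle_bits);     // bits living in ptr[1]
    item |= uint8_t((ptr[1] & mask_hi) << straddle_bits);            // high part
  }
  return item;
}

void insert(uint8_t* ptr, unsigned idx, uint8_t value) {
  uint8_t item   = uint8_t(value & bit_mask);
  uint8_t mask_0 = uint8_t(bit_mask << idx);
  ptr[0] = uint8_t((ptr[0] & ~mask_0) | (item << idx));              // current storage word
  if (idx + value_bits > storage_bits) {
    unsigned straddle_bits = storage_bits - idx;
    uint8_t  mask_1        = uint8_t(bit_mask >> straddle_bits);
    ptr[1] = uint8_t((ptr[1] & ~mask_1) | (item >> straddle_bits));  // next storage word
  }
}

int main() {
  uint8_t buf[2] = {0, 0};
  insert(buf, 5, 0x2A);              // 6-bit value at bit offset 5 straddles buf[0]/buf[1]
  assert(extract(buf, 5) == 0x2A);   // round-trips through both storage words
  return 0;
}
```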
+template +CUTE_HOST_DEVICE +void +pretty_print(subbyte_reference ref) { + cute::pretty_print(ref.get()); +} + +// +// subbyte_iterator +// Random-access iterator over subbyte references +// +template +struct subbyte_iterator +{ + // Iterator Element type (const or non-const) + using element_type = T; + // Iterator Value type without type qualifier. + using value_type = remove_cv_t; + // Storage type (const or non-const) + using storage_type = conditional_t<(is_const_v), subbyte_storage_type_t const, subbyte_storage_type_t>; + // Reference proxy type + using reference = subbyte_reference; + + static_assert(sizeof_bits_v % 8 == 0, "Storage type is not supported"); + + static_assert(sizeof_bits_v <= sizeof_bits_v, + "Size of Element must not be greater than Storage."); + +private: + + template friend struct swizzle_ptr; + template friend CUTE_HOST_DEVICE constexpr U* raw_pointer_cast(subbyte_iterator const&); + template friend CUTE_HOST_DEVICE constexpr auto recast_ptr(subbyte_iterator const&); + template friend CUTE_HOST_DEVICE void print(subbyte_iterator const&); + + // Pointer to storage element + storage_type* ptr_; + + // Bit index of value_type starting position within storage_type element. + // RI: 0 <= idx_ < sizeof_bit + uint8_t idx_; + +public: + + // Default Ctor + CUTE_HOST_DEVICE constexpr + subbyte_iterator() : ptr_{nullptr}, idx_{0} {}; + + // Ctor + template + CUTE_HOST_DEVICE constexpr + subbyte_iterator(PointerType* ptr, uint8_t idx = 0) : ptr_(reinterpret_cast(ptr)), idx_(idx) { } + + CUTE_HOST_DEVICE constexpr + reference operator*() const { + return reference(ptr_, idx_); + } + + CUTE_HOST_DEVICE constexpr + subbyte_iterator& operator+=(uint64_t k) { + k = sizeof_bits_v * k + idx_; + ptr_ += k / sizeof_bits_v; + idx_ = k % sizeof_bits_v; + return *this; + } + + CUTE_HOST_DEVICE constexpr + subbyte_iterator operator+(uint64_t k) const { + return subbyte_iterator(ptr_, idx_) += k; + } + + CUTE_HOST_DEVICE constexpr + reference operator[](uint64_t k) const { + return *(*this + k); + } + + CUTE_HOST_DEVICE constexpr + subbyte_iterator& operator++() { + idx_ += sizeof_bits_v; + if (idx_ >= sizeof_bits_v) { + ++ptr_; + idx_ -= sizeof_bits_v; + } + return *this; + } + + CUTE_HOST_DEVICE constexpr + subbyte_iterator operator++(int) { + subbyte_iterator ret(*this); + ++(*this); + return ret; + } + + CUTE_HOST_DEVICE constexpr + subbyte_iterator& operator--() { + if (idx_ >= sizeof_bits_v) { + idx_ -= sizeof_bits_v; + } else { + --ptr_; + idx_ += sizeof_bits_v - sizeof_bits_v; + } + return *this; + } + + CUTE_HOST_DEVICE constexpr + subbyte_iterator operator--(int) { + subbyte_iterator ret(*this); + --(*this); + return ret; + } + + CUTE_HOST_DEVICE constexpr friend + bool operator==(subbyte_iterator const& x, subbyte_iterator const& y) { + return x.ptr_ == y.ptr_ && x.idx_ == y.idx_; + } + CUTE_HOST_DEVICE constexpr friend + bool operator!=(subbyte_iterator const& x, subbyte_iterator const& y) { return !(x == y); } + CUTE_HOST_DEVICE constexpr friend + bool operator< (subbyte_iterator const& x, subbyte_iterator const& y) { + return x.ptr_ < y.ptr_ || (x.ptr_ == y.ptr_ && x.idx_ < y.idx_); + } + CUTE_HOST_DEVICE constexpr friend + bool operator<=(subbyte_iterator const& x, subbyte_iterator const& y) { return !(y < x); } + CUTE_HOST_DEVICE constexpr friend + bool operator> (subbyte_iterator const& x, subbyte_iterator const& y) { return (y < x); } + CUTE_HOST_DEVICE constexpr friend + bool operator>=(subbyte_iterator const& x, subbyte_iterator const& y) { return !(x < y); } +}; + +// 
Conversion to raw pointer with loss of subbyte index +template +CUTE_HOST_DEVICE constexpr +T* +raw_pointer_cast(subbyte_iterator const& x) { + assert(x.idx_ == 0); + return reinterpret_cast(x.ptr_); +} + +// Conversion to NewT_ with possible loss of subbyte index +template +CUTE_HOST_DEVICE constexpr +auto +recast_ptr(subbyte_iterator const& x) { + using NewT = conditional_t<(is_const_v), NewT_ const, NewT_>; + if constexpr (cute::is_subbyte_v) { // Making subbyte_iter, preserve the subbyte idx + return subbyte_iterator(x.ptr_, x.idx_); + } else { // Not subbyte, assume/assert subbyte idx 0 + return reinterpret_cast(raw_pointer_cast(x)); + } + CUTE_GCC_UNREACHABLE; +} + +// Dynamic pointers have unknown static alignment +template +CUTE_HOST_DEVICE constexpr +Int<0> +max_alignment(subbyte_iterator const& x) { + return {}; +} + +template +CUTE_HOST_DEVICE void +print(subbyte_iterator const& x) { + printf("subptr[%db](%p.%u)", int(sizeof_bits_v), x.ptr_, x.idx_); +} + +template +CUTE_HOST_DEVICE void +print(subbyte_reference const& x) { + print(x.get()); +} + +// +// array_subbyte +// Statically sized array for non-byte-aligned data types +// +template +struct array_subbyte +{ + using element_type = T; + using value_type = remove_cv_t; + using pointer = element_type*; + using const_pointer = element_type const*; + + using size_type = size_t; + using difference_type = ptrdiff_t; + + // + // References + // + using reference = subbyte_reference; + using const_reference = subbyte_reference; + + // + // Iterators + // + using iterator = subbyte_iterator; + using const_iterator = subbyte_iterator; + + // Storage type (const or non-const) + using storage_type = conditional_t<(is_const_v), subbyte_storage_type_t const, subbyte_storage_type_t>; + + static_assert(sizeof_bits_v % 8 == 0, "Storage type is not supported"); + +private: + + // Number of storage elements, ceil_div + static constexpr size_type StorageElements = (N * sizeof_bits_v + sizeof_bits_v - 1) / sizeof_bits_v; + + // Internal storage + storage_type storage[StorageElements]; + +public: + + CUTE_HOST_DEVICE constexpr + size_type size() const { + return N; + } + + CUTE_HOST_DEVICE constexpr + size_type max_size() const { + return N; + } + + CUTE_HOST_DEVICE constexpr + bool empty() const { + return !N; + } + + // Efficient clear method + CUTE_HOST_DEVICE constexpr + void clear() { + CUTE_UNROLL + for (size_type i = 0; i < StorageElements; ++i) { + storage[i] = storage_type(0); + } + } + + CUTE_HOST_DEVICE constexpr + void fill(T const& value) { + CUTE_UNROLL + for (size_type i = 0; i < N; ++i) { + at(i) = value; + } + } + + CUTE_HOST_DEVICE constexpr + reference at(size_type pos) { + return iterator(storage)[pos]; + } + + CUTE_HOST_DEVICE constexpr + const_reference at(size_type pos) const { + return const_iterator(storage)[pos]; + } + + CUTE_HOST_DEVICE constexpr + reference operator[](size_type pos) { + return at(pos); + } + + CUTE_HOST_DEVICE constexpr + const_reference operator[](size_type pos) const { + return at(pos); + } + + CUTE_HOST_DEVICE constexpr + reference front() { + return at(0); + } + + CUTE_HOST_DEVICE constexpr + const_reference front() const { + return at(0); + } + + CUTE_HOST_DEVICE constexpr + reference back() { + return at(N-1); + } + + CUTE_HOST_DEVICE constexpr + const_reference back() const { + return at(N-1); + } + + // In analogy to std::vector::data(), these functions are deleted to prevent bugs. 
+ // Instead, prefer + // auto* data = raw_pointer_cast(my_subbyte_array.begin()); + // where the type of auto* is implementation-defined and + // with the knowledge that [data, data + my_subbyte_array.size()) may not be a valid range. + CUTE_HOST_DEVICE constexpr + pointer data() = delete; + + CUTE_HOST_DEVICE constexpr + const_pointer data() const = delete; + + CUTE_HOST_DEVICE constexpr + iterator begin() { + return iterator(storage); + } + + CUTE_HOST_DEVICE constexpr + const_iterator begin() const { + return const_iterator(storage); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cbegin() const { + return begin(); + } + + CUTE_HOST_DEVICE constexpr + iterator end() { + return iterator(storage) + N; + } + + CUTE_HOST_DEVICE constexpr + const_iterator end() const { + return const_iterator(storage) + N; + } + + CUTE_HOST_DEVICE constexpr + const_iterator cend() const { + return end(); + } + + // + // Comparison operators + // + +}; + +// +// Operators +// + +template +CUTE_HOST_DEVICE constexpr +void clear(array_subbyte& a) +{ + a.clear(); +} + +template +CUTE_HOST_DEVICE constexpr +void fill(array_subbyte& a, T const& value) +{ + a.fill(value); +} + +} // namespace cute + +// +// Specialize tuple-related functionality for cute::array_subbyte +// + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +namespace cute +{ + +template +CUTE_HOST_DEVICE constexpr +T& get(array_subbyte& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +T const& get(array_subbyte const& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +T&& get(array_subbyte&& a) +{ + static_assert(I < N, "Index out of range"); + return cute::move(a[I]); +} + +} // end namespace cute + +namespace CUTE_STL_NAMESPACE +{ + +template +struct is_reference> + : CUTE_STL_NAMESPACE::true_type +{}; + + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> +{ + using type = T; +}; + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> +{ + using type = T; +}; + +} // end namespace CUTE_STL_NAMESPACE + +#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD +namespace std +{ + +#if defined(__CUDACC_RTC__) +template +struct tuple_size; + +template +struct tuple_element; +#endif + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> +{ + using type = T; +}; + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> +{ + using type = T; +}; + +} // end namespace std +#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD diff --git a/include/cute/container/bit_field.hpp b/include/cute/container/bit_field.hpp new file mode 100644 index 0000000000..d7fac42a54 --- /dev/null +++ b/include/cute/container/bit_field.hpp @@ -0,0 +1,133 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Portable bit field that supports byte and word straddling that can + be used in unions to bit-wise define parameters. +*/ + +#pragma once + +#include // CUTE_HOST_DEVICE +#include // uint_bit_t +#include // cute::is_same + +namespace cute +{ + +class dummy_type {}; + +template +struct bit_field +{ + static_assert(0 < NumBits && NumBits <= 64, "bit_fields with more than 64 bits are not supported."); + + // value_type: Use the smallest value type that fits NumBits + static constexpr uint32_t value_type_bits = (NumBits <= 8) ? 8 : + (NumBits <= 16) ? 16 : + (NumBits <= 32) ? 32 : 64; + using value_type = cute::uint_bit_t; + // storage_type: Use the smallest storage_type that avoids boundary crossing + static constexpr uint32_t storage_type_bits = (BitStart / 8 == (BitStart + NumBits - 1) / 8) ? 8 : + (BitStart / 16 == (BitStart + NumBits - 1) / 16) ? 16 : + (BitStart / 32 == (BitStart + NumBits - 1) / 32) ? 32 : 64; + using storage_type = cute::uint_bit_t; + + static_assert(sizeof(OtherValueType) == sizeof(value_type) || is_same::value, + "sizeof(OtherValueType) must be same as sizeof(value_type)."); + + // Number of storage values needed: ceil_div(BitStart + NumBits, storage_type_bits) + static constexpr uint32_t N = (BitStart + NumBits + storage_type_bits - 1) / storage_type_bits; + // Index of storage value for BitStart + static constexpr uint32_t idx = BitStart / storage_type_bits; + // Bit of data_[idx] for BitStart + static constexpr uint32_t bit_lo = BitStart % storage_type_bits; + // Number of bits in data_[idx] used for NumBits if straddling, else 0 + static constexpr uint32_t bit_hi = (idx + 1 < N) ? (storage_type_bits - bit_lo) : 0; + +public: + + // NumBits mask + static constexpr value_type mask = value_type(uint64_t(-1) >> (64u - NumBits)); + // NumBits mask for BitStart + static constexpr storage_type mask_lo = storage_type(mask) << bit_lo; + // NumBits mask for leftover bits in data_[idx+1] if straddling, else 0 + static constexpr storage_type mask_hi = (idx + 1 < N) ? 
(storage_type(mask) >> bit_hi) : 0; + + storage_type data_[N]; + + // Get value + CUTE_HOST_DEVICE constexpr + value_type get() const { + storage_type result = (data_[idx] & mask_lo) >> bit_lo; + if constexpr (bit_hi != 0) { + result |= (data_[idx+1] & mask_hi) << bit_hi; + } + return static_cast(result); + } + + // Set value + CUTE_HOST_DEVICE constexpr + void set(value_type x) { + storage_type item = static_cast(x & mask); + data_[idx] = static_cast((data_[idx] & ~mask_lo) | (item << bit_lo)); + if constexpr (bit_hi != 0) { + data_[idx+1] = static_cast((data_[idx+1] & ~mask_hi) | (item >> bit_hi)); + } + } + + // Assign value + CUTE_HOST_DEVICE constexpr + bit_field& operator=(value_type x) { + set(x); + return *this; + } + + // Cast to value + CUTE_HOST_DEVICE constexpr + operator value_type () const { + return get(); + } + + // Assign OtherValueType + CUTE_HOST_DEVICE constexpr + bit_field& operator=(OtherValueType x) { + return *this = *reinterpret_cast(&x); + } + + // Cast to OtherValueType + CUTE_HOST_DEVICE constexpr + operator OtherValueType () const { + value_type x = get(); + return *reinterpret_cast(&x); + } +}; + +} // end namespace cute diff --git a/include/cute/container/cuda_types.hpp b/include/cute/container/cuda_types.hpp new file mode 100644 index 0000000000..fbc314e543 --- /dev/null +++ b/include/cute/container/cuda_types.hpp @@ -0,0 +1,183 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE, CUTE_GCC_UNREACHABLE +#include // cute::integral_constant + +namespace cute +{ + +// +// dim3 +// + +using dim3 = ::dim3; + +// MSVC doesn't define its C++ version macro to match +// its C++ language version. 
This means that when +// building with MSVC, dim3 isn't constexpr-friendly. +template +CUTE_HOST_DEVICE +#if ! defined(_MSC_VER) +constexpr +#endif +uint32_t& get(dim3& a) +{ + static_assert(I < 3, "Index out of range"); + if constexpr (I == 0) { + return a.x; + } else if constexpr (I == 1) { + return a.y; + } else if constexpr (I == 2) { + return a.z; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE +#if ! defined(_MSC_VER) +constexpr +#endif +uint32_t const& get(dim3 const& a) +{ + static_assert(I < 3, "Index out of range"); + if constexpr (I == 0) { + return a.x; + } else if constexpr (I == 1) { + return a.y; + } else if constexpr (I == 2) { + return a.z; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE +#if ! defined(_MSC_VER) +constexpr +#endif +uint32_t&& get(dim3&& a) +{ + static_assert(I < 3, "Index out of range"); + if constexpr (I == 0) { + return cute::move(a.x); + } else if constexpr (I == 1) { + return cute::move(a.y); + } else if constexpr (I == 2) { + return cute::move(a.z); + } + + CUTE_GCC_UNREACHABLE; +} + +// Specialize cute::tuple-traits for external types +template <> +struct tuple_size + : integral_constant +{}; + +template +struct tuple_element +{ + using type = uint32_t; +}; + +// +// uint3 +// + +using uint3 = ::uint3; + +template +CUTE_HOST_DEVICE constexpr +uint32_t& get(uint3& a) +{ + static_assert(I < 3, "Index out of range"); + if constexpr (I == 0) { + return a.x; + } else if constexpr (I == 1) { + return a.y; + } else if constexpr (I == 2) { + return a.z; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +uint32_t const& get(uint3 const& a) +{ + static_assert(I < 3, "Index out of range"); + if constexpr (I == 0) { + return a.x; + } else if constexpr (I == 1) { + return a.y; + } else if constexpr (I == 2) { + return a.z; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +uint32_t&& get(uint3&& a) +{ + static_assert(I < 3, "Index out of range"); + if constexpr (I == 0) { + return cute::move(a.x); + } else if constexpr (I == 1) { + return cute::move(a.y); + } else if constexpr (I == 2) { + return cute::move(a.z); + } + + CUTE_GCC_UNREACHABLE; +} + +// Specialize cute::tuple-traits for external types +template <> +struct tuple_size + : integral_constant +{}; + +template +struct tuple_element +{ + using type = uint32_t; +}; + +} // end namespace cute diff --git a/include/cute/container/packed_tuple.hpp b/include/cute/container/packed_tuple.hpp new file mode 100644 index 0000000000..c20df2c235 --- /dev/null +++ b/include/cute/container/packed_tuple.hpp @@ -0,0 +1,254 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace cute { + +namespace detail { + +// Empty Structure Optimization +template +struct ESO; + +template +static constexpr bool is_first_empty_v = cute::is_empty::value; +template +static constexpr bool is_rest_empty_v = (cute::is_empty::value && ...); + +template +using ESO_t = ESO, is_rest_empty_v, T...>; + +// Empty First and Empty Rest... +template +struct ESO { + CUTE_HOST_DEVICE constexpr + ESO() {} + + CUTE_HOST_DEVICE constexpr + ESO(First const&, Rest const&...) {} +}; + +// NonEmpty First and Empty Rest... +template +struct ESO { + CUTE_HOST_DEVICE constexpr + ESO() : first_{} {} + + CUTE_HOST_DEVICE constexpr + ESO(First const& first, Rest const&...) : first_{first} {} + + First first_; +}; + +// Empty First and NonEmpty Rest... +template +struct ESO { + CUTE_HOST_DEVICE constexpr + ESO() : rest_{} {} + + CUTE_HOST_DEVICE constexpr + ESO(First const&, Rest const&... rest) : rest_{rest...} {} + + ESO_t rest_; +}; + +// NonEmpty T and NonEmpty Rest... +template +struct ESO { + CUTE_HOST_DEVICE constexpr + ESO() : first_{}, rest_{} {} + + CUTE_HOST_DEVICE constexpr + ESO(First const& first, Rest const&... rest) : first_{first}, rest_{rest...} {} + + First first_; + ESO_t rest_; +}; + +// Get Nth value from ESO +template +CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO const& s) { + if constexpr (N == 0) { + if constexpr (F) { return T{}; } + else { return static_cast(s.first_); } + } else { + if constexpr (R) { return cute::tuple_element_t>{}; } + else { return getv(s.rest_); } + } +} + +template +CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO& s) { + if constexpr (N == 0) { + if constexpr (F) { return T{}; } + else { return static_cast(s.first_); } + } else { + if constexpr (R) { return cute::tuple_element_t>{}; } + else { return getv(s.rest_); } + } +} + +template +CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO&& s) { + if constexpr (N == 0) { + if constexpr (F) { return T{}; } + else { return static_cast(s.first_); } + } else { + if constexpr (R) { return cute::tuple_element_t>{}; } + else { return getv(static_cast&&>(s.rest_)); } + } +} + +// findt: Implementation detail of cute::find. +// If X is the first template argument of the tuple, findt returns C. 
+ +template +CUTE_HOST_DEVICE constexpr +auto +findt(ESO const& t) noexcept +{ + if constexpr (cute::is_same_v) { + return C{}; + } + else { + static_assert(sizeof...(Rest) != 0, + "The type does not appear in the argument list of the tuple."); + if constexpr (IsRestEmpty) { + // The rest is empty, so creating an instance of it is cheap. + return cute::detail::findt(ESO_t{}); + } + else { + return cute::detail::findt(t.rest_); + } + } +} + +} // end namespace detail + +// packed_tuple is a tuple type that is a standard-layout type +// whenever all of its template arguments are standard layout types: +// (cute::is_standard_layout_v && ...) implies (cute::is_standard_layout_v>) + +template +struct packed_tuple : detail::ESO_t +{ + CUTE_HOST_DEVICE constexpr + packed_tuple() {} + + CUTE_HOST_DEVICE constexpr + packed_tuple(T const&... ts) + : detail::ESO_t(ts...) + {} +}; + +template <> +struct packed_tuple<> {}; + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(packed_tuple const& t) { + static_assert(I < sizeof...(T), "Index out of range"); + return detail::getv(t); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(packed_tuple& t) { + static_assert(I < sizeof...(T), "Index out of range"); + return detail::getv(t); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(packed_tuple&& t) { + static_assert(I < sizeof...(T), "Index out of range"); + return detail::getv(static_cast&&>(t)); +} + +template +CUTE_HOST_DEVICE constexpr +packed_tuple +make_packed_tuple(T const&... t) +{ + return {t...}; +} + +// Returns the position of type X (as a static integer) in the tuple +// type's argument list. X must be unique in the argument list. +template +CUTE_HOST_DEVICE constexpr +auto +find(packed_tuple const& t) noexcept +{ + return detail::findt(t); +} + +} // end namespace cute + +namespace CUTE_STL_NAMESPACE +{ + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> + : CUTE_STL_NAMESPACE::tuple_element> +{}; + +} // end namespace CUTE_STL_NAMESPACE + +#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD +namespace std { + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> + : CUTE_STL_NAMESPACE::tuple_element> +{}; + +} // end namespace std +#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD diff --git a/include/cute/container/tuple.hpp b/include/cute/container/tuple.hpp new file mode 100644 index 0000000000..3123a68d83 --- /dev/null +++ b/include/cute/container/tuple.hpp @@ -0,0 +1,744 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include +#include // cute::true_type, cute::false_type +#include + +#include +#include +#if defined(CUTLASS_USE_PACKED_TUPLE) +# include +#endif + +//#include // Advanced optimizations + +// cute::tuple is like std::tuple, with two differences. +// +// 1. It works on both host and device. +// 2. Its template arguments must be semiregular types. +// +// Semiregular types are default constructible and copyable. +// They include "value types" like int or float, +// but do _not_ include references like int& or float&. +// (See std::tie for an example of a tuple of references.) +// +// If the template arguments of cute::tuple are all empty types (in +// the sense of std::is_empty_v), then the cute::tuple is also an +// empty type. Furthermore, if CUTLASS_USE_PACKED_TUPLE is defined, +// cute::tuple is always a standard-layout type if all of its template +// arguments are standard-layout types. + +namespace cute +{ + +#if defined(CUTLASS_USE_PACKED_TUPLE) + +template +using tuple = packed_tuple; + +#else + +namespace detail +{ + +// This is simplified over the implementations in std::, cuda::std::, and thrust:: by ignoring much of +// the conversion SFINAE, special overloading, and avoiding cvref template types. +// +// Over standard-conforming tuple implementations, this appears to accelerate compilation times by over 3x. + +// EBO stands for "empty base optimization." +// We use this technique to ensure that cute::tuple +// doesn't need to waste space storing any template arguments +// of cute::tuple that have no data (like integral_constant). +// Otherwise, cute::tuple would need to spend at least 1 byte +// for each of its template arguments. +// +// This is one way in which cute::tuple differs from std::tuple. +// Empty types in the template argument list are not even constructed, +// and do not have unique element addresses. In fact, they are not +// even members of the tuple or stored in any way. Calling `get` +// constructs and returns an instance of an empty type on demand. +// +// EBO always "holds" a single value of type T. +// N is like an array index that TupleBase uses +// to access the desired tuple element. +template ::value> +struct EBO; + +template +CUTE_HOST_DEVICE constexpr C findt(EBO const&) +{ return {}; } + +// Specialization for types T that have no data; +// the "static tuple leaf." Valid T here include +// integral_constant, Int, +// and any other semiregular type +// for which std::is_empty_v is true. 
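+// Illustrative sketch of the effect (assuming the usual empty-base
+// optimization on the target compiler):
+//
+// \code
+//   cute::tuple<cute::Int<3>, int> t(cute::Int<3>{}, 7);
+//   auto s = cute::get<0>(t);   // Int<3>{} is synthesized on demand, not stored
+//   int  x = cute::get<1>(t);   // 7, read from the dynamic leaf below
+//   // sizeof(t) is typically sizeof(int): the empty leaf contributes no storage.
+// \endcode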
+template +struct EBO +{ + CUTE_HOST_DEVICE constexpr + EBO() {} + + CUTE_HOST_DEVICE constexpr + EBO(T const&) {} +}; + +template +CUTE_HOST_DEVICE constexpr T getv(EBO const&) +{ return {}; } + +// Specialization for types T that are not empty; +// the "dynamic tuple leaf." Valid T here include int, +// any other integral or floating-point type, +// or any semiregular type for which std::is_empty_v is false. +template +struct EBO +{ + CUTE_HOST_DEVICE constexpr + EBO() : t_{} {} + + CUTE_HOST_DEVICE constexpr + EBO(T const& t) : t_{t} {} + + T t_; +}; + +template +CUTE_HOST_DEVICE constexpr T const& getv(EBO const& x) +{ return x.t_; } + +template +CUTE_HOST_DEVICE constexpr T& getv(EBO& x) +{ return x.t_; } + +template +CUTE_HOST_DEVICE constexpr T&& getv(EBO&& x) +{ return cute::move(x.t_); } + +template +struct TupleBase; + +// Base class of cute::tuple binds each element to an index +// by inheriting from EBO for each (i, t) in (I..., T...). +// The storage (for nonempty t) lives in the base classes. +template +struct TupleBase, T...> + : EBO... +{ + CUTE_HOST_DEVICE constexpr + TupleBase() {} + + CUTE_HOST_DEVICE constexpr + TupleBase(T const&... t) : EBO(t)... {} +}; + +} // end namespace detail + +// Attempting to use the following commented-out alias +// in the declaration of `struct tuple` causes MSVC 2022 build errors. +// +//template +//using TupleBase = detail::TupleBase, T...>; + +// This is the actual cute::tuple class. +// The storage (if any) lives in TupleBase's EBO base classes. +// +// Inheriting from the above alias TupleBase +// causes MSVC 2022 build errors when assigning one tuple to another: +// In summary: this is verbose as a work-around for MSVC build errors. +template +struct tuple : detail::TupleBase, T...> +{ + CUTE_HOST_DEVICE constexpr + tuple() {} + + CUTE_HOST_DEVICE constexpr + tuple(T const&... t) : detail::TupleBase, T...>(t...) {} +}; + +template <> +struct tuple<> +{}; + +// +// get for cute::tuple (just like std::get for std::tuple) +// + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(tuple const& t) noexcept +{ + static_assert(I < sizeof...(T), "Index out of range"); + return detail::getv(t); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(tuple& t) noexcept +{ + static_assert(I < sizeof...(T), "Index out of range"); + return detail::getv(t); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(tuple&& t) noexcept +{ + static_assert(I < sizeof...(T), "Index out of range"); + return detail::getv(static_cast&&>(t)); +} + +// +// find a type X within a cute::tuple +// Requires X to be unique in tuple +// Returns a static integer +// + +template +CUTE_HOST_DEVICE constexpr +auto +find(tuple const& t) noexcept +{ + return detail::findt(t); +} + +#endif // CUTLASS_USE_PACKED_TUPLE + +// +// Custom is_tuple trait simply checks the existence of tuple_size +// and assumes std::get(.), std::tuple_element +// +namespace detail { + +template +auto has_tuple_size( T*) -> bool_constant<(0 <= tuple_size::value)>; +auto has_tuple_size(...) -> false_type; + +} // end namespace detail + +template +struct is_tuple : decltype(detail::has_tuple_size((T*)0)) {}; + +template +constexpr bool is_tuple_v = cute::is_tuple::value; + +// +// make_tuple (value-based implementation) +// + +template +CUTE_HOST_DEVICE constexpr +tuple +make_tuple(T const&... t) +{ + return {t...}; +} + +// +// tuple_cat concatenates multiple cute::tuple into a single cute::tuple, +// just like std::tuple_cat for std::tuple. 
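+// For instance (illustrative only):
+//
+// \code
+//   auto a = cute::make_tuple(cute::Int<2>{}, 3);
+//   auto b = cute::make_tuple(4, cute::Int<5>{});
+//   auto c = cute::tuple_cat(a, b);   // c is (_2, 3, 4, _5)
+//   int  x = cute::get<2>(c);         // 4
+// \endcode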
+// + +#if 0 +// Original implementation + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, + index_sequence, index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)...); +} + +} // end namespace detail + +CUTE_HOST_DEVICE constexpr +tuple<> +tuple_cat() +{ + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +Tuple const& +tuple_cat(Tuple const& t) +{ + return t; +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1) +{ + return detail::tuple_cat(t0, t1, + make_index_sequence::value>{}, + make_index_sequence::value>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, Ts const&... ts) +{ + return cute::tuple_cat(cute::tuple_cat(t0,t1),t2,ts...); +} +#endif + +#if 1 +// Extended implementation + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, + index_sequence, index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, + index_sequence, index_sequence, index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)..., get(t2)...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, + index_sequence, index_sequence, index_sequence, index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)..., get(t2)..., get(t3)...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4, + index_sequence, index_sequence, index_sequence, index_sequence, index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)..., get(t2)..., get(t3)..., get(t4)...); +} + +template +struct tuple_cat_static; + +template +struct tuple_cat_static, tuple> { + using type = tuple; +}; + +} // end namespace detail + +CUTE_HOST_DEVICE constexpr +tuple<> +tuple_cat() +{ + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +Tuple const& +tuple_cat(Tuple const& t) +{ + return t; +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1) +{ + if constexpr (is_static::value && is_static::value && + is_tuple::value && is_tuple::value) { + return typename detail::tuple_cat_static::type{}; + } else { + return detail::tuple_cat(t0, t1, + make_index_sequence::value>{}, + make_index_sequence::value>{}); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2) +{ + return detail::tuple_cat(t0, t1, t2, + make_index_sequence::value>{}, + make_index_sequence::value>{}, + make_index_sequence::value>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3) +{ + return detail::tuple_cat(t0, t1, t2, t3, + make_index_sequence::value>{}, + make_index_sequence::value>{}, + make_index_sequence::value>{}, + make_index_sequence::value>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4) +{ + return detail::tuple_cat(t0, t1, t2, t3, t4, + make_index_sequence::value>{}, + make_index_sequence::value>{}, + make_index_sequence::value>{}, + make_index_sequence::value>{}, + make_index_sequence::value>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& 
t4, T5 const& t5, Ts const&... ts) +{ + return cute::tuple_cat(cute::tuple_cat(t0,t1,t2,t3,t4), cute::tuple_cat(t5, ts...)); +} +#endif + +#if 0 +// Outer-Inner indexing trick to concat all tuples at once + +namespace detail { + +template +struct tuple_cat_helper +{ + static constexpr cute::array ns = {Ns...}; + + static constexpr size_t total_size() { + size_t sum = 0; + for (size_t n : ns) sum += n; + return sum; + } + static constexpr size_t total_size_ = total_size(); + + static constexpr auto values() { + cute::array outer_inner = {}; + + size_t idx = 0; + for (size_t i = 0; i < ns.size(); ++i) { + for (size_t j = 0; j < ns[i]; ++j, ++idx) { + outer_inner[idx][0] = i; + outer_inner[idx][1] = j; + } + } + return outer_inner; + } + static constexpr auto outer_inner_ = values(); + + using total_sequence = make_index_sequence; +}; + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(Tuple const& t, index_sequence) +{ + return cute::make_tuple(get(get(t))...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, + index_sequence, index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)...); +} + +} // end namespace detail + +CUTE_HOST_DEVICE constexpr +tuple<> +tuple_cat() +{ + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +Tuple const& +tuple_cat(Tuple const& t) +{ + return t; +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1) +{ + return detail::tuple_cat(t0, t1, + make_index_sequence::value>{}, + make_index_sequence::value>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(Tuples const&... ts) +{ + using Helper = detail::tuple_cat_helper::value...>; + return detail::tuple_cat(cute::make_tuple(ts...), typename Helper::total_sequence{}); +} +#endif + +// +// Equality operators +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +equal_impl(TupleA const& a, TupleB const& b, index_sequence) +{ + return (cute::true_type{} && ... && (get(a) == get(b))); +} + +} // end namespace detail + +template ::value && is_tuple::value)> +CUTE_HOST_DEVICE constexpr +auto +operator==(TupleT const& t, TupleU const& u) +{ + if constexpr (tuple_size::value == tuple_size::value) { + return detail::equal_impl(t, u, make_index_sequence::value>{}); + } else { + return cute::false_type{}; + } + + CUTE_GCC_UNREACHABLE; +} + +template ::value ^ is_tuple::value)> +CUTE_HOST_DEVICE constexpr +auto +operator==(TupleT const& t, TupleU const& u) +{ + return cute::false_type{}; +} + +template ::value && is_tuple::value)> +CUTE_HOST_DEVICE constexpr +auto +operator!=(TupleT const& t, TupleU const& u) +{ + return !(t == u); +} + +template ::value ^ is_tuple::value)> +CUTE_HOST_DEVICE constexpr +auto +operator!=(TupleT const& t, TupleU const& u) +{ + return cute::true_type{}; +} + +// +// Comparison operators +// + +// +// There are many ways to compare tuple of elements and because CuTe is built +// on parameterizing layouts of coordinates, some comparisons are appropriate +// only in certain cases. +// -- lexicographical comparison [reverse, reflected, revref] +// -- colexicographical comparison [reverse, reflected, revref] +// -- element-wise comparison [any,all] +// This can be very confusing. To avoid errors in selecting the appropriate +// comparison, op<|op<=|op>|op>= are *not* implemented for cute::tuple. +// +// That said, see int_tuple for more explicitly named common comparison ops. 
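+// Equality, by contrast, is defined above and can yield a static result when
+// the comparison is decidable at compile time. An illustrative sketch:
+//
+// \code
+//   auto t  = cute::make_tuple(cute::Int<2>{}, cute::Int<3>{});
+//   auto u  = cute::make_tuple(cute::Int<2>{}, 3);
+//   auto v  = cute::make_tuple(cute::Int<2>{});
+//   auto b0 = (t == u);   // dynamic result: the 3 in u is a runtime value
+//   auto b1 = (t == v);   // cute::false_type{}: mismatched ranks compare unequal
+// \endcode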
+// + +// +// Display utilities +// + +namespace detail { + +template +CUTE_HOST_DEVICE void print_tuple(Tuple const& t, index_sequence, char s = '(', char e = ')') +{ + using cute::print; + if (sizeof...(Is) == 0) { + print(s); + } else { + ((void(print(Is == 0 ? s : ',')), void(print(get(t)))), ...); + } + print(e); +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& print_tuple_os(std::ostream& os, Tuple const& t, index_sequence, char s = '(', char e = ')') +{ + if (sizeof...(Is) == 0) { + os << s; + } else { + (void(os << (Is == 0 ? s : ',') << get(t)), ...); + } + return os << e; +} +#endif // !defined(__CUDACC_RTC__) + +} // end namespace detail + +template ::value)> +CUTE_HOST_DEVICE void print(Tuple const& t) +{ + return detail::print_tuple(t, make_index_sequence::value>{}); +} + +#if !defined(__CUDACC_RTC__) +template ::value)> +CUTE_HOST std::ostream& operator<<(std::ostream& os, Tuple const& t) +{ + return detail::print_tuple_os(os, t, make_index_sequence::value>{}); +} +#endif // !defined(__CUDACC_RTC__) + +} // end namespace cute + +#if ! defined(CUTLASS_USE_PACKED_TUPLE) + +namespace CUTE_STL_NAMESPACE +{ + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> + : CUTE_STL_NAMESPACE::tuple_element> +{}; + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> + : CUTE_STL_NAMESPACE::tuple_element> +{}; + +} // end namespace CUTE_STL_NAMESPACE + +// +// std compatibility +// + +#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD +namespace std +{ + +#if defined(__CUDACC_RTC__) +template +struct tuple_size; + +template +struct tuple_element; +#endif + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> + : CUTE_STL_NAMESPACE::tuple_element> +{}; + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> + : CUTE_STL_NAMESPACE::tuple_element> +{}; + +} // end namespace std +#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD + +#endif // CUTLASS_USE_PACKED_TUPLE diff --git a/include/cute/container/type_list.hpp b/include/cute/container/type_list.hpp new file mode 100644 index 0000000000..a15f2c1c15 --- /dev/null +++ b/include/cute/container/type_list.hpp @@ -0,0 +1,124 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE, CUTE_STL_NAMESPACE + +namespace cute +{ + +template +struct type_list {}; + +// get for type_list +// requires tuple_element_t> to have std::is_default_constructible +template +CUTE_HOST_DEVICE constexpr +CUTE_STL_NAMESPACE::tuple_element_t> +get(type_list const& t) noexcept { + return {}; +} + +} // end namespace cute + +// +// Specialize tuple-related functionality for cute::type_list +// + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +namespace CUTE_STL_NAMESPACE +{ + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> +{ + using type = typename CUTE_STL_NAMESPACE::tuple_element>::type; +}; + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> +{ + using type = typename CUTE_STL_NAMESPACE::tuple_element>::type; +}; + +} // end namespace std + +#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD +namespace std +{ + +#if defined(__CUDACC_RTC__) +template +struct tuple_size; + +template +struct tuple_element; +#endif + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> +{ + using type = typename CUTE_STL_NAMESPACE::tuple_element>::type; +}; + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> +{ + using type = typename CUTE_STL_NAMESPACE::tuple_element>::type; +}; + +} // end namespace std +#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD diff --git a/include/cute/int_tuple.hpp b/include/cute/int_tuple.hpp new file mode 100644 index 0000000000..95d06bbdd7 --- /dev/null +++ b/include/cute/int_tuple.hpp @@ -0,0 +1,864 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::array +#include // cute::is_tuple +#include // cute::Int +#include // cute::transform + +/** IntTuple is an integer or a tuple of IntTuples. + * This file holds utilities for working with IntTuples, + * but does not hold a concrete concept or class of IntTuple. + */ + +namespace cute +{ + +// Implementation of get<0>(Integral). +// Even though is_tuple is false and tuple_size doesn't compile, +// CuTe defines rank(Integral) as 1, so it's useful for get<0>(Integral) to return its input +template >::value)> +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(T&& t) noexcept +{ + static_assert(I == 0, "Index out of range"); + return static_cast(t); +} + +// Custom recursive get for anything that implements get(.) (for a single integer I). +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(T&& t) noexcept +{ + return get(get(static_cast(t))); +} + +// +// rank +// + +template +CUTE_HOST_DEVICE constexpr +auto +rank(IntTuple const& t) +{ + if constexpr (sizeof...(Is) == 0) { + if constexpr (is_tuple::value) { + return Int::value>{}; + } else { + return Int<1>{}; + } + } else { + return rank(get(t)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +using rank_t = decltype(rank(declval())); + +template +static constexpr auto rank_v = rank_t::value; + +// +// shape +// + +template +CUTE_HOST_DEVICE constexpr +auto +shape(IntTuple const& s) +{ + if constexpr (is_tuple::value) { + return transform(s, [](auto const& a) { return shape(a); }); + } else { + return s; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +shape(IntTuple const& s) +{ + if constexpr (is_tuple::value) { + return shape(get(s)); + } else { + return get(shape(s)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// max +// + +template +CUTE_HOST_DEVICE constexpr +auto +max(T0 const& t0, Ts const&... ts) +{ + if constexpr (is_tuple::value) { + return cute::max(cute::apply(t0, [](auto const&... a){ return cute::max(a...); }), ts...); + } else if constexpr (sizeof...(Ts) == 0) { + return t0; + } else { + return cute::max(t0, cute::max(ts...)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// min +// + +template +CUTE_HOST_DEVICE constexpr +auto +min(T0 const& t0, Ts const&... ts) +{ + if constexpr (is_tuple::value) { + return cute::min(cute::apply(t0, [](auto const&... 
a){ return cute::min(a...); }), ts...); + } else if constexpr (sizeof...(Ts) == 0) { + return t0; + } else { + return cute::min(t0, cute::min(ts...)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// gcd +// + +template +CUTE_HOST_DEVICE constexpr +auto +gcd(T0 const& t0, Ts const&... ts) +{ + if constexpr (is_tuple::value) { + return cute::gcd(cute::apply(t0, [](auto const&... a){ return cute::gcd(a...); }), ts...); + } else if constexpr (sizeof...(Ts) == 0) { + return t0; + } else { + return cute::gcd(t0, cute::gcd(ts...)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// depth +// + +template +CUTE_HOST_DEVICE constexpr +auto +depth(IntTuple const& t) +{ + if constexpr (sizeof...(Is) == 0) { + if constexpr (is_tuple::value) { + return Int<1>{} + cute::apply(t, [](auto const&... v){ return cute::max(depth(v)...); }); + } else { + return Int<0>{}; + } + } else { + return depth(get(t)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +using depth_t = decltype(depth(declval())); + +template +static constexpr auto depth_v = depth_t::value; + +// +// product +// + +// Implementation of product as a function object +struct Product +{ + template + CUTE_HOST_DEVICE constexpr + auto + operator()(IntTuple const& a) const + { + if constexpr (is_tuple::value) { + if constexpr (tuple_size::value == 0) { + return Int<1>{}; + } else { + return cute::transform_apply(a, Product{}, multiplies_unary_lfold{}); + } + } else if constexpr (cute::is_integral::value) { + return a; + } + + CUTE_GCC_UNREACHABLE; + } +}; +// Callable product function object +CUTE_INLINE_CONSTANT Product product; + +// Return a rank(t) tuple @a result such that get(@a result) = product(get(@a t)) +template +CUTE_HOST_DEVICE constexpr +auto +product_each(Tuple const& t) +{ + return transform(wrap(t), product); +} + +// Take the product of Tuple at the leaves of TupleG +template +CUTE_HOST_DEVICE constexpr +auto +product_like(Tuple const& tuple, TupleG const& guide) +{ + return transform_leaf(guide, tuple, [](auto const& g, auto const& t) { return product(t); }); +} + +// Return the product of elements in a mode +template +CUTE_HOST_DEVICE constexpr +auto +size(IntTuple const& a) +{ + if constexpr (sizeof...(Is) == 0) { + return product(a); + } else { + return size(get(a)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +static constexpr auto size_v = decltype(size(declval()))::value; + +// +// sum +// + +template +CUTE_HOST_DEVICE constexpr +auto +sum(IntTuple const& a) +{ + if constexpr (is_tuple::value) { + return cute::apply(a, [](auto const&... v){ return (Int<0>{} + ... + sum(v)); }); + } else { + return a; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// inner_product +// + +template +CUTE_HOST_DEVICE constexpr +auto +inner_product(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched ranks"); + return transform_apply(a, b, [](auto const& x, auto const& y) { return inner_product(x,y); }, + [](auto const&... v) { return (Int<0>{} + ... 
+ v); }); + } else { + return a * b; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// ceil_div +// + +template +CUTE_HOST_DEVICE constexpr +auto +ceil_div(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // tuple tuple + static_assert(tuple_size::value >= tuple_size::value, "Mismatched ranks"); + constexpr int R = tuple_size::value; // Missing ranks in TupleB are implicitly 1 + return transform(a, append(b,Int<1>{}), [](auto const& x, auto const& y) { return ceil_div(x,y); }); + } else { // tuple int + auto const [result, rest] = fold(a, cute::make_tuple(cute::make_tuple(), b), + [] (auto const& init, auto const& ai) { + return cute::make_tuple(append(get<0>(init), ceil_div(ai, get<1>(init))), ceil_div(get<1>(init), ai)); + }); + return result; + } + } else + if constexpr (is_tuple::value) { // int tuple + return ceil_div(a, product(b)); + } else { + return (a + b - Int<1>{}) / b; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// round_up +// Round @a a up to the nearest multiple of @a b. +// For negative numbers, rounds away from zero. +// + +template +CUTE_HOST_DEVICE constexpr +auto +round_up(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + static_assert(tuple_size::value >= tuple_size::value, "Mismatched ranks"); + constexpr int R = tuple_size::value; // Missing ranks in TupleB are implicitly 1 + return transform(a, append(b,Int<1>{}), [](auto const& x, auto const& y) { return round_up(x,y); }); + } else { + return ((a + b - Int<1>{}) / b) * b; + } + + CUTE_GCC_UNREACHABLE; +} + +/** Division for Shapes + * Case Tuple Tuple: + * Perform shape_div element-wise + * Case Tuple Int: + * Fold the division of b across each element of a + * Example: shape_div((4,5,6),40) -> shape_div((1,5,6),10) -> shape_div((1,1,6),2) -> (1,1,3) + * Case Int Tuple: + * Return shape_div(a, product(b)) + * Case Int Int: + * Enforce the divisibility condition a % b == 0 || b % a == 0 when possible + * Return a / b with rounding away from 0 (that is, 1 or -1 when a < b) + */ +template +CUTE_HOST_DEVICE constexpr +auto +shape_div(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // tuple tuple + static_assert(tuple_size::value == tuple_size::value, "Mismatched ranks"); + return transform(a, b, [](auto const& x, auto const& y) { return shape_div(x,y); }); + } else { // tuple int + auto const [result, rest] = fold(a, cute::make_tuple(cute::make_tuple(), b), + [] (auto const& init, auto const& ai) { + return cute::make_tuple(append(get<0>(init), shape_div(ai, get<1>(init))), shape_div(get<1>(init), ai)); + }); + return result; + } + } else + if constexpr (is_tuple::value) { // int tuple + return shape_div(a, product(b)); + } else + if constexpr (is_static::value && is_static::value) { + static_assert(IntTupleA::value % IntTupleB::value == 0 || IntTupleB::value % IntTupleA::value == 0, "Static shape_div failure"); + return C{}; + } else { // int int + //assert(a % b == 0 || b % a == 0); // Waive dynamic assertion + return a / b != 0 ? 
a / b : signum(a) * signum(b); // Division with rounding away from zero + } + + CUTE_GCC_UNREACHABLE; +} + +/** Minimum for Shapes + */ +template +CUTE_HOST_DEVICE constexpr +auto +shape_min(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value || is_tuple::value) { + static_assert(dependent_false, "Not implemented."); + } else + if constexpr (is_constant<1, IntTupleA>::value || is_constant<1, IntTupleB>::value) { + return Int<1>{}; // _1 is less than all other shapes, preserve static + } else { + return cute::min(a, b); + } + + CUTE_GCC_UNREACHABLE; +} + +/** Return a tuple the same profile as A scaled by corresponding elements in B + */ +template +CUTE_HOST_DEVICE constexpr +auto +elem_scale(A const& a, B const& b) +{ + if constexpr (is_tuple::value) { + return transform(a, b, [](auto const& x, auto const& y) { return elem_scale(x,y); }); + } else { + return a * product(b); + } + + CUTE_GCC_UNREACHABLE; +} + +/** Test if two IntTuple have the same profile (hierarchical rank division) + */ +template +CUTE_HOST_DEVICE constexpr +auto +congruent(IntTupleA const& a, IntTupleB const& b) +{ + return bool_constant::value>{}; +} + +template +using is_congruent = decltype(congruent(declval(), declval())); + +/** Test if two IntTuple have the similar profiles up to Shape A (hierarchical rank division) + * weakly_congruent is a partial order on A and B: A <= B + */ +template +CUTE_HOST_DEVICE constexpr +auto +weakly_congruent(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + if constexpr (tuple_size::value != tuple_size::value) { + return false_type{}; + } else { + return transform_apply(a, b, [](auto const& x, auto const& y) { return weakly_congruent(x,y); }, + [](auto const&... z) { return (true_type{} && ... && z); }); + } + } else if constexpr (is_integral::value) { + return true_type{}; + } else if constexpr (is_integral::value) { + return false_type{}; + } else { + return weakly_congruent(shape(a), shape(b)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +using is_weakly_congruent = decltype(weakly_congruent(declval(), declval())); + +/** Test if Shape A is compatible with Shape B: + * the size of A and B are the same, and + * any coordinate into A can also be used as a coordinate into B + * Equivalently, the size of Shape B is the same as Shape A at each terminal of Shape A. + * compatible is a partial order on A and B: A <= B + */ +template +CUTE_HOST_DEVICE constexpr +auto +compatible(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + if constexpr (tuple_size::value != tuple_size::value) { + return false_type{}; + } else { + return transform_apply(a, b, [](auto const& x, auto const& y) { return compatible(x,y); }, + [](auto const&... z) { return (true_type{} && ... && z); }); + } + } else if constexpr (is_integral::value) { + return a == size(b); + } else if constexpr (is_integral::value) { + return false_type{}; + } else { + return compatible(shape(a), shape(b)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +using is_compatible = decltype(compatible(declval(), declval())); + +/** Test if Shape A is evenly divided by Tiler B + * @returns Static or dynamic boolean + * @post if result is true_type, then + * size(a) == logical_divide(make_layout(shape(a)),b) will always compile + * and result in true_type. 
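+ *
+ * Illustrative sketch (shapes built with cute::make_tuple and cute::Int):
+ * \code
+ *   evenly_divides(make_tuple(Int<8>{}, Int<4>{}), make_tuple(Int<2>{}, Int<2>{}));  // true_type{}
+ *   evenly_divides(make_tuple(Int<8>{}, Int<4>{}), Int<3>{});                        // false_type{}
+ * \endcode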
+ */ +template +CUTE_HOST_DEVICE constexpr +auto +evenly_divides(Shape const& a, Tiler const& b) +{ + if constexpr (is_tuple::value) { + if constexpr (rank_v > rank_v) { + return false_type{}; + } else { + return transform_apply(b, a, [](auto const& x, auto const& y) { return evenly_divides(y,x); }, + [](auto const&... z) { return (true_type{} && ... && z); }); + } + } else { + return size(a) == size(b) * size(ceil_div(shape(a), b)); + } + + CUTE_GCC_UNREACHABLE; +} + +/** Replace the elements of Tuple B that are paired with an Int<0> with an Int<1> + */ +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value) { + return transform(a, b, [](auto const& x, auto const& y) { return filter_zeros(x,y); }); + } else if constexpr (is_constant<0, IntTupleA>::value) { + return repeat_like(b, Int<1>{}); + } else { + return b; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(Tuple const& t) +{ + return filter_zeros(t, t); +} + +// +// Converters and constructors with arrays and params +// + +/** Make an IntTuple of rank N from an Indexable array. + * Access elements up to a dynamic index n, then use init (requires compatible types) + * Consider cute::take if all indexing is known to be valid + * \code + * std::vector a = {6,3,4}; + * auto tup = make_int_tuple<5>(a, a.size(), 0) // (6,3,4,0,0) + * \endcode + */ +template +CUTE_HOST_DEVICE constexpr +auto +make_int_tuple(Indexable const& t, int n, T const& init) +{ + static_assert(N > 0); + if constexpr (N == 1) { + return 0 < n ? t[0] : init; + } else { + return transform(make_seq{}, [&](auto i) { return i < n ? t[i] : init; }); + } + + CUTE_GCC_UNREACHABLE; +} + +/** Fill the dynamic values of a Tuple with values from another Tuple + * \code + * auto params = make_tuple(6,3,4); + * cute::tuple, cute::tuple>, int, Int<2>> result; + * fill_int_tuple_from(result, params); // (_1,(6,3,_3),4,_2) + * \endcode + */ +template +CUTE_HOST_DEVICE constexpr +auto +fill_int_tuple_from(Tuple& result, TupleV const& vals) +{ + return fold(result, vals, [](auto const& init, auto&& r) { + if constexpr (is_static>::value) { // Skip static elements of result + return init; + } else if constexpr (is_tuple>::value) { // Recurse into tuples + return fill_int_tuple_from(r, init); + } else { // Assign and consume arg + static_assert(tuple_size>::value > 0, "Not enough values to fill with!"); + r = get<0>(init); + return remove<0>(init); + } + + CUTE_GCC_UNREACHABLE; + }); +} + +/** Make a "Tuple" by filling in the dynamic values in order from the arguments + * \code + * using result_t = cute::tuple, cute::tuple>, int, Int<2>>; + * auto result = make_int_tuple_from(6,3,4); // (_1,(6,3,_3),4,_2) + * \endcode + */ +template +CUTE_HOST_DEVICE constexpr +Tuple +make_int_tuple_from(Ts const&... 
ts) +{ + Tuple result = Tuple{}; + fill_int_tuple_from(result, cute::make_tuple(ts...)); + return result; +} + +/** Convert a tuple to a flat homogeneous array of type T + * \code + * auto tup = cute::make_tuple(Int<1>{}, cute::make_tuple(6,3,Int<3>{}),4,Int<2>{}); + * cute::array result = to_array(tup); // [1,6,3,3,4,2] + * \endcode + */ +template +CUTE_HOST_DEVICE constexpr +auto +to_array(IntTuple const& t) +{ + auto flat_t = flatten_to_tuple(t); + constexpr int N = tuple_size::value; + cute::array result; + for_each(make_seq{}, [&] (auto i) { result[i] = get(flat_t); }); + return result; +} + +// +// Comparison operators +// + +// +// There are many ways to compare tuple of elements and because CuTe is built +// on parameterizing layouts of coordinates, some comparisons are appropriate +// only in certain cases. +// -- lexicographical comparison [reverse, reflected, revref] : Correct for coords in RowMajor Layout +// -- colexicographical comparison [reverse, reflected, revref] : Correct for coords in ColMajor Layout +// -- element-wise comparison [any,all] : +// This can be very confusing. To avoid errors in selecting the appropriate +// comparison, op<|op<=|op>|op>= are *not* implemented for cute::tuple. +// +// When actually desiring to order coordinates, the user should map them to +// their indices within the Layout they came from: +// e.g. layoutX(coordA) < layoutX(coordB) +// That said, we implement the three most common ways to compare tuples below. +// These are implemented with slighly more explicit names than op<. +// + +template +CUTE_HOST_DEVICE constexpr +auto +lex_less(IntTupleA const& a, IntTupleB const& b); + +template +CUTE_HOST_DEVICE constexpr +auto +colex_less(IntTupleA const& a, IntTupleB const& b); + +template +CUTE_HOST_DEVICE constexpr +auto +elem_less(IntTupleA const& a, IntTupleB const& b); + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +lex_less_impl(TupleA const& a, TupleB const& b) +{ + if constexpr (I == tuple_size::value) { + return cute::false_type{}; // Terminal: TupleB is exhausted + } else if constexpr (I == tuple_size::value) { + return cute::true_type{}; // Terminal: TupleA is exhausted, TupleB is not exhausted + } else { + return lex_less(get(a), get(b)) || (get(a) == get(b) && lex_less_impl(a,b)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +colex_less_impl(TupleA const& a, TupleB const& b) +{ + if constexpr (I == tuple_size::value) { + return cute::false_type{}; // Terminal: TupleB is exhausted + } else if constexpr (I == tuple_size::value) { + return cute::true_type{}; // Terminal: TupleA is exhausted, TupleB is not exhausted + } else { + constexpr size_t A = tuple_size::value - 1 - I; + constexpr size_t B = tuple_size::value - 1 - I; + return colex_less(get(a), get(b)) || (get(a) == get(b) && colex_less_impl(a,b)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +elem_less_impl(TupleA const& a, TupleB const& b) +{ + if constexpr (I == tuple_size::value) { + return cute::true_type{}; // Terminal: TupleA is exhausted + } else if constexpr (I == tuple_size::value) { + return cute::false_type{}; // Terminal: TupleA is not exhausted, TupleB is exhausted + } else { + return elem_less(get(a), get(b)) && elem_less_impl(a,b); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +// Lexicographical comparison + +template +CUTE_HOST_DEVICE constexpr +auto +lex_less(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) 
{ + return detail::lex_less_impl<0>(a, b); + } else { + return a < b; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +lex_leq(T const& t, U const& u) { + return !lex_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +lex_gtr(T const& t, U const& u) { + return lex_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +lex_geq(T const& t, U const& u) { + return !lex_less(t, u); +} + +// Colexicographical comparison + +template +CUTE_HOST_DEVICE constexpr +auto +colex_less(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + return detail::colex_less_impl<0>(a, b); + } else { + return a < b; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +colex_leq(T const& t, U const& u) { + return !colex_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +colex_gtr(T const& t, U const& u) { + return colex_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +colex_geq(T const& t, U const& u) { + return !colex_less(t, u); +} + +// Elementwise [all] comparison + +template +CUTE_HOST_DEVICE constexpr +auto +elem_less(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + return detail::elem_less_impl<0>(a, b); + } else { + return a < b; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +elem_leq(T const& t, U const& u) { + return !elem_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +elem_gtr(T const& t, U const& u) { + return elem_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +elem_geq(T const& t, U const& u) { + return !elem_less(t, u); +} + +} // end namespace cute diff --git a/include/cute/layout.hpp b/include/cute/layout.hpp new file mode 100644 index 0000000000..26195a4782 --- /dev/null +++ b/include/cute/layout.hpp @@ -0,0 +1,2058 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include // cute::sizeof_bits + +namespace cute +{ + +// Aliases + +template +using Shape = cute::tuple; + +template +using Stride = cute::tuple; + +template +using Step = cute::tuple; + +template +using Coord = cute::tuple; + +template +using Tile = cute::tuple; + +template +CUTE_HOST_DEVICE constexpr +Shape +make_shape(Ts const&... t) { + return {t...}; +} +template +CUTE_HOST_DEVICE constexpr +Stride +make_stride(Ts const&... t) { + return {t...}; +} +template +CUTE_HOST_DEVICE constexpr +Step +make_step(Ts const&... t) { + return {t...}; +} +template +CUTE_HOST_DEVICE constexpr +Coord +make_coord(Ts const&... t) { + return {t...}; +} +template +CUTE_HOST_DEVICE constexpr +Tile +make_tile(Ts const&... t) +{ + return {t...}; +} + +// +// Layout +// + +template > +struct Layout + : private cute::tuple // EBO for static layouts +{ + // Expensive in compilation time... + //static_assert(is_congruent::value, "Shape and Stride must be congruent"); + + // NOTE: This defaults static Shapes/Strides correctly, but not dynamic + CUTE_HOST_DEVICE constexpr + Layout(Shape const& shape = {}, Stride const& stride = {}) + : cute::tuple(shape, stride) + {} + + // + // Accessors + // + + static constexpr int rank = rank_v; + + CUTE_HOST_DEVICE constexpr + decltype(auto) + layout() { + return *this; + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + layout() const { + return *this; + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + shape() { + return get<0,I...>(static_cast&>(*this)); + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + shape() const { + return get<0,I...>(static_cast const&>(*this)); + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + stride() { + return get<1,I...>(static_cast&>(*this)); + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + stride() const { + return get<1,I...>(static_cast const&>(*this)); + } + + // + // Mappings + // + + // Map a logical coordinate to a linear index (Coord has no Underscore slice operators) + // OR + // Slice the layout and return the sublayout (Coord has an Underscore slice op) + template + CUTE_HOST_DEVICE constexpr + auto + operator()(Coord const& coord) const { + if constexpr (has_underscore::value) { + return slice(coord, *this); + } else { + return crd2idx(coord, shape(), stride()); + } + + CUTE_GCC_UNREACHABLE; + } + + // Convenience function for multi-dimensional coordinates + template + CUTE_HOST_DEVICE constexpr + auto + operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... 
cs) const { + return operator()(make_coord(c0,c1,cs...)); + } + + // + // Compose + // + + template + CUTE_HOST_DEVICE constexpr + auto + compose(OtherLayout const& other) const { + return composition(*this, other); + } + + template + CUTE_HOST_DEVICE constexpr + auto + compose(Layouts const&... layouts) const { + return composition(*this, make_tile(layouts...)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + with_shape(OtherShape const& shape) const { + return composition(*this, make_layout(shape)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + with_shape(Shapes const&... shapes) const { + return composition(*this, make_layout(make_shape(shapes...))); + } + + // + // Tile + // + + template + CUTE_HOST_DEVICE constexpr + auto + tile(OtherLayout const& other) const { + return tiled_divide(*this, other); + } + + template + CUTE_HOST_DEVICE constexpr + auto + tile(Layouts const&... layouts) const { + return tiled_divide(*this, make_tile(layouts...)); + } + + // + // Utility + // + + // + // Index to Coordinate + // + + // NOTE: Only valid for compact layouts + + // Return the (hierarchical) ND logical coordinate corresponding to the linear index + // @post crd2idx(@a result, shape(), stride()) == idx + // @post congruent(@a result, shape()) + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_hier_coord(IInt const& idx) const { + return cute::idx2crd(idx, shape(), stride()); + } + + // Return the (flat) ND logical coordinate corresponding to the linear index + // @post crd2idx(@a result, shape(), stride()) == idx + // @post rank(@a result) == rank(shape()) && depth(@a result) == 1 + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_flat_coord(IInt const& idx) const { + return cute::crd2crd(this->get_hier_coord(idx), shape(), repeat(Int<1>{})); + } + + // Return the generalized column-major 1D logical coordinate corresponding to the linear index + // @post crd2idx(@a result, shape(), stride()) == idx + // @post is_integral::value + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_1d_coord(IInt const& idx) const { + return cute::crd2idx(this->get_hier_coord(idx), shape()); + } + + // + // Coordinate to Coordinate + // + +#if 0 + // Return the (hierarchical) ND logical coordinate corresponding to the linear index + // @post congruent(@a result, shape()) + template + CUTE_HOST_DEVICE constexpr + auto + crd_2_hier_coord(Coord const& crd) const { + return cute::crd2crd(crd, shape(), shape()); + } + + // Return the (flat) ND logical coordinate corresponding to the linear index + // @post rank(@a result) == rank(shape()) && depth(@a result) == 1 + template + CUTE_HOST_DEVICE constexpr + auto + crd_2_flat_coord(Coord const& crd) const { + return cute::crd2crd(crd, shape(), product_each(shape())); + } + + // Return the generalized column-major 1D logical coordinate corresponding to the linear index + // @post is_integral::value + template + CUTE_HOST_DEVICE constexpr + auto + crd_2_1d_coord(Coord const& crd) const { + //return cute::crd2crd(crd, shape(), product(shape())); + return cute::crd2idx(crd, shape()); + } +#endif +}; + +// Equality, return a static or dynamic boolean +template +CUTE_HOST_DEVICE constexpr +auto +operator==(Layout const& layoutA, Layout const& layoutB) +{ + return layoutA.shape() == layoutB.shape() && layoutA.stride() == layoutB.stride(); +} + +template +struct is_layout : false_type {}; +template +struct is_layout> : true_type {}; + +// +// Layout construction +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Shape 
const& shape, Stride const& stride) +{ + static_assert(is_tuple::value || is_integral::value); + static_assert(is_tuple::value || is_integral::value); + return Layout(shape, stride); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Shape const& shape) +{ + static_assert(is_tuple::value || is_integral::value); + return make_layout(shape, compact_major(shape)); +} + +// +// Convenience tags for common layouts +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Shape const& shape, LayoutLeft) +{ + return make_layout(shape, compact_major(shape)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Shape const& shape, LayoutRight) +{ + return make_layout(shape, compact_major(shape)); +} + +// +// Construct a layout from multiple layouts by concatenation +// + +// One argument overload +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Layout const& layout0) +{ + return make_layout(make_shape (layout0.shape() ), + make_stride(layout0.stride())); +} + +// Two argument overload +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Layout const& layout0, + Layout const& layout1) +{ + return make_layout(make_shape (layout0.shape() , layout1.shape() ), + make_stride(layout0.stride(), layout1.stride())); +} + +// Var argument overload +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Layout const& layout0, + Layout const& layout1, + Layout const&... layouts) +{ + return make_layout(make_shape (layout0.shape() , layout1.shape() , layouts.shape()... ), + make_stride(layout0.stride(), layout1.stride(), layouts.stride()...)); +} + +// +// Advanced Layout constructions +// + +// Make a compact layout with shape @a shape and strides following the order induced by @a order. +// Dynamic values in @a order are ignored, considered large, and considered ordered from left to right. +// Example: +// make_ordered_layout(Shape<_2,_2,_2,_2>{}, Step<_0,_2,_3,_1>{}) +// -> (_2,_2,_2,_2):(_1,_4,_8,_2) +// make_ordered_layout(make_shape(2,3,4,5), make_step(Int<2>{}, 67, 42, Int<50>{})) +// -> (2,3,4,5):(_1,10,30,2) +template +CUTE_HOST_DEVICE constexpr +auto +make_ordered_layout(Shape const& shape, Order const& order) +{ + return make_layout(shape, compact_order(shape, order)); +} + +// Make a compact layout with the same shape as @a layout +// and strides following the order induced by @a layout.stride(). +// Static-0 strides in the input @a layout are preserved in the output. +// Example: +// make_layout_like(Layout, Stride<_0,_2,_4,_1>>{}) +// -> (_2,_2,_2,_2):(_0,_2,_4,_1) +// make_layout_like(make_layout(make_shape(2,3,4,5), make_stride(Int<0>{},42,Int<1>{},Int<0>{}))) +// -> (2,3,4,5):(_0,4,_1,_0) +template +CUTE_HOST_DEVICE constexpr +auto +make_layout_like(Layout const& layout) +{ + return make_layout(layout.shape(), + compact_order(filter_zeros(layout.stride(), layout.shape()), layout.stride())); +} + +// Make a compact layout with the same shape as @a layout +// and strides following the order induced by @a layout.stride(), +// except mode-0 is always stride-1 and generated column-major. 
+// The 0th mode is commonly used for MMA_Atoms or Copy_Atoms so this +// generates the 0th mode with LayoutLeft (preserving stride-0s) regardless of the reference layout +template +CUTE_HOST_DEVICE constexpr +auto +make_fragment_like(Layout const& layout) +{ + constexpr int R = Layout::rank; + if constexpr (R > 1 && is_static::value) { + return tiled_product(make_layout(get<0>(layout.shape()), + compact_major(filter_zeros(get<0>(layout.stride()), get<0>(layout.shape())))), + make_ordered_layout(take<1,R>(layout.shape()), take<1,R>(layout.stride()))); + } else { + return make_layout(layout.shape()); + } + + CUTE_GCC_UNREACHABLE; +} + +template ::value || is_integral::value)> +CUTE_HOST_DEVICE constexpr +auto +make_fragment_like(Shape const& shape) +{ + return make_layout(shape); +} + +// +// Make an identity layout that maps a coordinate to itself +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_identity_layout(Shape const& shape) +{ + return make_layout(shape, make_basis_like(shape)); +} + +// +// Operations to manipulate Layouts like a tuple of pairs +// + +// Return the Is...th sublayout. +// For Is... = , equivalent to get(...get(get(layout))) +template +CUTE_HOST_DEVICE constexpr +auto +get(Layout const& layout) +{ + return make_layout(get(layout.shape()), + get(layout.stride())); +} + +// Return a new layout with only the modes in the range [B,E) +template +CUTE_HOST_DEVICE constexpr +auto +take(Layout const& layout) +{ + static_assert(B < E, "take: empty range error"); + static_assert(0 <= B && E <= Layout::rank, "take: range out of bounds"); + return make_layout(take(layout.shape()), + take(layout.stride())); +} + +// Return a new layout with only the modes Is... = +template +CUTE_HOST_DEVICE constexpr +auto +select(Layout const& layout) +{ + return make_layout(select(layout.shape()), + select(layout.stride())); +} + +// Return a layout with depth at most 1 +template +CUTE_HOST_DEVICE constexpr +auto +flatten(Layout const& layout) +{ + return make_layout(flatten(layout.shape()), + flatten(layout.stride())); +} + +// Return a layout whose profile is congruent to TargetProfile +// @pre Input layout is flat, flatten(@a layout) == @a layout +// @pre Input layout can be folded to profile, rank(@a layout) == rank(flatten(@a target_profile)) +// @post congruent(@a result, @a target_profile) +template +CUTE_HOST_DEVICE constexpr +auto +unflatten(Layout const& layout, TargetProfile const& target_profile) +{ + return make_layout(unflatten(layout.shape(), target_profile), + unflatten(layout.stride(), target_profile)); +} + +// +// Utilities +// + +// Return the sublayout of mode I... 
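+// Illustrative sketch:
+// \code
+//   auto L  = make_layout(make_shape(4, 8));   // (4,8):(_1,4), compact column-major
+//   auto m1 = layout<1>(L);                    // 8:4, the second mode as its own Layout
+//   auto L0 = layout(L);                       // with no mode indices, returns L itself
+// \endcode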
+template +CUTE_HOST_DEVICE constexpr +decltype(auto) +layout(Layout const& layout) +{ + if constexpr (sizeof...(Is) == 0) { + return layout; + } else { + return get(layout); + } + + CUTE_GCC_UNREACHABLE; +} + +// Return the shape of a mode +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +shape(Layout& layout) +{ + return layout.template shape(); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +shape(Layout const& layout) +{ + return layout.template shape(); +} + +// Return the stride of a mode +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +stride(Layout& layout) +{ + return layout.template stride(); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +stride(Layout const& layout) +{ + return layout.template stride(); +} + +// Return the number of elements in a mode +template +CUTE_HOST_DEVICE constexpr +auto +size(Layout const& layout) +{ + return size(shape(layout)); +} + +// Return the number of modes +template +CUTE_HOST_DEVICE constexpr +auto +rank(Layout const& layout) +{ + return rank(shape(layout)); +} + +// Return the depth of the layout +template +CUTE_HOST_DEVICE constexpr +auto +depth(Layout const& layout) +{ + return depth(shape(layout)); +} + +// Return the codomain shape of a mode +// @post size(coshape(@a a)) == cosize(@a a) +// @return C Coordinate with smallest elements such that +// @a elem_less(sub_layout(c), C) for all c < size(@a sub_layout) +// where sub_layout = get(layout). +template +CUTE_HOST_DEVICE constexpr +auto +coshape(Layout const& layout) +{ + // Protect against negative strides + auto abs_sub_layout = make_layout(shape(layout), + transform_leaf(stride(layout), abs_fn{})); + auto co_coord = as_arithmetic_tuple(abs_sub_layout(size(abs_sub_layout) - Int<1>{})); + return co_coord + repeat_like(co_coord, Int<1>{}); +} + +// Return the codomain size of a mode +// @return M smallest integer such that +// @a sub_layout(c) < M for all c < size(@a sub_layout) +// where sub_layout = get(layout). 
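+// Illustrative examples, worked from the definitions above:
+//   cosize(_4:_2)           -> _7   // image is {0,2,4,6}, so the smallest bound is 7
+//   cosize((_2,_2):(_4,_1)) -> _6   // last coordinate (1,1) maps to 5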
+template +CUTE_HOST_DEVICE constexpr +auto +cosize(Layout const& layout) +{ + return size(coshape(layout)); +} + +template +using cosize_t = decltype(cosize(declval())); + +template +static constexpr auto cosize_v = cosize_t::value; + +// With crd2idx(coord, shape), makes sense to have crd2idx(coord, Layout) as well +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx(Coord const& c, Layout const& layout) +{ + return crd2idx(c, layout.shape(), layout.stride()); +} + +// +// Slice and Dice a layout +// + +template +CUTE_HOST_DEVICE constexpr +auto +slice(Coord const& c, Layout const& layout) +{ + return make_layout(slice(c, layout.shape()), + slice(c, layout.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +slice_and_offset(Coord const& c, Layout const& layout) +{ + return cute::make_tuple(slice(c, layout), crd2idx(c, layout)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +dice(Coord const& c, Layout const& layout) +{ + return make_layout(dice(c, layout.shape()), + dice(c, layout.stride())); +} + +// Compute a pointer offset and (potentially modified) layout from a coordinate +// This exists so it can be overloaded for ComposedLayout +template +CUTE_HOST_DEVICE constexpr +auto +domain_offset(Coord const& coord, Layout const& layout) +{ + return cute::make_tuple(layout, layout(coord)); +} + +// +// Transform the modes of a layout +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +transform_layout(Tuple const& t, F&& f, seq) +{ + return make_layout(f(get(t))...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform_layout(Tuple0 const& t0, Tuple1 const& t1, F&& f, seq, seq, seq) +{ + return make_layout(f(get(t0),get(t1))..., get(t0)..., get(t1)...); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +transform_layout(Tuple const& t, F&& f) +{ + return detail::transform_layout(t, f, make_seq{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform_layout(Tuple0 const& t0, Tuple1 const& t1, F&& f) +{ + constexpr int R0 = decltype(rank(t0))::value; + constexpr int R1 = decltype(rank(t1))::value; + constexpr int R = (R0 < R1) ? 
R0 : R1; + return detail::transform_layout(t0, t1, f, make_seq{}, make_range{}, make_range{}); +} + +// +// Coalesce and Filter +// + +namespace detail { + +// Look at each element and the front of the stack (in order of priority) +// front(NewLayout) get(Layout) +// s0:d0 _1:d1 => continue +// _1:d0 s1:d1 => replace_front s1:d1 +// s0:s1*d1 s1:d1 => replace_front s0*s1:d1 +// s0:d0 s1:d1 => prepend s1:d1 +// +// @pre OldShape and OldStride are flat +template +CUTE_HOST_DEVICE constexpr +auto +bw_coalesce(OldShape const& old_shape, OldStride const& old_stride, + NewShape const& new_shape, NewStride const& new_stride) +{ + if constexpr (I == -1) { + // Base case, we're done + if constexpr (is_constant<1, NewShape>::value) { + return Layout<_1,_0>{}; + } else { + return Layout{new_shape,new_stride}; + } + } else if constexpr (is_constant<1, decltype(get(old_shape))>::value) { + // shape(layout) == _1, skip it and continue + return bw_coalesce(old_shape, old_stride, new_shape, new_stride); + } else if constexpr (is_constant<1, NewShape>::value) { + // Replace our shape-1 with anything (Can only happen on input new_shape/new_stride) + return bw_coalesce(old_shape, old_stride, get(old_shape), get(old_stride)); + } else if constexpr (is_static(new_shape))>::value && + is_constant(old_shape) * get(old_stride) == get<0>(new_stride))>::value) { + // Merge modes because the shapes and strides match + return bw_coalesce(old_shape, old_stride, + replace_front(new_shape, get(old_shape) * get<0>(new_shape)), + replace_front(new_stride, get(old_stride))); + } else { + // Can't replace or merge, so prepend a new mode + return bw_coalesce(old_shape, old_stride, + prepend(new_shape, get(old_shape)), + prepend(new_stride, get(old_stride))); + } + + CUTE_GCC_UNREACHABLE; +} + +// cute::coalesce promises to not change the Layout as a function from integers to codomain. +// It accomplishes this inside of the Layout's domain, but not always outside of the domain. +// Example: (_4,_1):(_1,_0) coalesces to _4:_1. +// detail::coalesce_x preserves the Layout function inside its domain and outside. 
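+// Illustrative examples, worked from bw_coalesce above and coalesce_x below:
+//   coalesce  ((_2,(_1,_6)):(_1,(_6,_2))) -> _12:_1
+//   coalesce_x((_4,_1):(_1,_0))           -> (_4,_2):(_1,_0)  // still agrees with the input for i >= 4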
+// +// @post depth(@a result) <= 1 +// @post for all i, 0 <= i, @a layout(i) == @a result(i) +template +CUTE_HOST_DEVICE constexpr +auto +coalesce_x(Layout const& layout) +{ + auto flat_shape = flatten(layout.shape()); + auto flat_stride = flatten(layout.stride()); + + constexpr int R = decltype(rank(flat_shape))::value; + if constexpr (is_constant<1, decltype(get(flat_shape))>::value) { + return detail::bw_coalesce(flat_shape, flat_stride, Int<2>{}, get(flat_stride)); + } else { + return detail::bw_coalesce(flat_shape, flat_stride, get(flat_shape), get(flat_stride)); + } +} + +// Apply coalesce_x at the terminals of trg_profile +template +CUTE_HOST_DEVICE constexpr +auto +coalesce_x(Layout const& layout, IntTuple const& trg_profile) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value <= Layout::rank); + return cute::transform_layout(layout, trg_profile, [](auto const& l, auto const& t) { return coalesce_x(l,t); }); + } else { + return coalesce_x(layout); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +// "Simplify" the layout by combining modes that are possible to combine +// Does not respect the shape of the layout, but does preserve total size +// @post size(@a result) == size(@a layout) +// @post depth(@a result) <= 1 +// @post for all i, 0 <= i < size(@a layout), @a layout(i) == @a result(i) +template +CUTE_HOST_DEVICE constexpr +auto +coalesce(Layout const& layout) +{ + auto flat_shape = flatten(layout.shape()); + auto flat_stride = flatten(layout.stride()); + + constexpr int R = decltype(rank(flat_shape))::value; + return detail::bw_coalesce(flat_shape, flat_stride, get(flat_shape), get(flat_stride)); +} + +// Apply coalesce at the terminals of trg_profile +template +CUTE_HOST_DEVICE constexpr +auto +coalesce(Layout const& layout, IntTuple const& trg_profile) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value <= Layout::rank); + return transform_layout(layout, trg_profile, [](auto const& l, auto const& t) { return coalesce(l,t); }); + } else { + return coalesce(layout); + } + + CUTE_GCC_UNREACHABLE; +} + +// Combine static and dynamic modes of a shape. 
+// @post size(@a result) == size(@a shape) +// @post depth(@a result) <= 1 +template +CUTE_HOST_DEVICE constexpr +auto +coalesce(Shape const& shape) +{ + static_assert(is_integral::value || is_tuple::value); + + return cute::fold_first(flatten(shape), [](auto const& init, auto const& a) { + if constexpr (is_static::value == is_static::value) { + return replace_back(init, back(init) * a); // Both static or both dynamic, coalesce and replace + } else { + return append(init, a); // Can't coalesce, so append + } + }); +} + +// Replace the modes in layout that have a 0-stride with a 1-size +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(Layout const& layout) +{ + return make_layout(filter_zeros(layout.stride(), layout.shape()), layout.stride()); +} + +// Replace the modes in layout that correspond to a 0 at the terminals of trg_profile with a 1-size +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(Layout const& layout, IntTuple const& trg_profile) +{ + return make_layout(filter_zeros(trg_profile, layout.shape()), layout.stride()); +} + +// Remove all of the 0-strides and 1-sizes +// Return 1-shape if empty +template +CUTE_HOST_DEVICE constexpr +auto +filter(Layout const& layout) +{ + return coalesce(filter_zeros(layout)); +} + +// Apply filter at the terminals of trg_profile +template +CUTE_HOST_DEVICE constexpr +auto +filter(Layout const& layout, IntTuple const& trg_profile) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value <= Layout::rank); + return transform_layout(layout, trg_profile, [](auto const& l, auto const& t) { return filter(l,t); }); + } else { + return filter(layout); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Append, Prepend, Replace +// + +template +CUTE_HOST_DEVICE constexpr +auto +append(Layout const& layout, + Layout const& x = {}) +{ + return make_layout(append(layout.shape(), x.shape()), + append(layout.stride(), x.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +append(Layout const& layout, + Layout const& x = {}) +{ + return make_layout(append(layout.shape(), x.shape()), + append(layout.stride(), x.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +prepend(Layout const& layout, + Layout const& x = {}) +{ + return make_layout(prepend(layout.shape(), x.shape()), + prepend(layout.stride(), x.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +prepend(Layout const& layout, + Layout const& x = {}) +{ + return make_layout(prepend(layout.shape(), x.shape()), + prepend(layout.stride(), x.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +replace(Layout const& layout, + Layout const& x) +{ + return make_layout(replace(layout.shape(), x.shape()), + replace(layout.stride(), x.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +group(Layout const& layout) +{ + return make_layout(group(layout.shape()), + group(layout.stride())); +} + +// +// Composition of two layouts: lhs o rhs +// @post compatible(rhs, result) +// @post result(c) = lhs(rhs(c)) +// for all c in the domain of rhs +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +composition_impl(LShape const& lhs_shape, LStride const& lhs_stride, + RShape const& rhs_shape, RStride const& rhs_stride) +{ + if constexpr (is_tuple::value) { + // Apply the right-distributivity of Layout composition + return transform_layout(rhs_shape, rhs_stride, [&](auto const& s, auto const& d) { + return composition_impl(lhs_shape, lhs_stride, s, d); + }); + } else + if constexpr (is_scaled_basis::value) { + // Special case for a 
ScaledBasis stride + return composition_impl(basis_get(rhs_stride, lhs_shape), basis_get(rhs_stride, lhs_stride), + rhs_shape, basis_value(rhs_stride)); + } else + if constexpr (is_constant<0, RStride>::value) { + // Special case shortcut for any static stride-0 + return Layout{rhs_shape, rhs_stride}; + } else + if constexpr (is_integral::value) { + // Special case shortcut for any integral LShape + return Layout{rhs_shape, rhs_stride * lhs_stride}; + } else + if constexpr (is_constant<1, RStride>::value) { + // Special case shortcut for any static stride-1 + constexpr int R = rank_v; + auto result_shape_0 = take<0,R-1>(lhs_shape); + + // Mod out the rhs_shape from the lhs_shape + auto const [result_shape_1, rest_shape] = fold(result_shape_0, cute::make_tuple(cute::make_tuple(), rhs_shape), + [] (auto const& init, auto const& si) { + return cute::make_tuple(append(get<0>(init), shape_min(abs(si), get<1>(init))), shape_div(get<1>(init), abs(si))); + }); + + // Jump into coalesce and append (rest_shape, get(lhs_stride)) + return detail::bw_coalesce(result_shape_1, lhs_stride, rest_shape, get(lhs_stride)); + } else { + // General case: integral RShape and RStride, tuple LShape and LStride + constexpr int R = rank_v; + auto result_shape_0 = take<0,R-1>(lhs_shape); + auto result_stride_0 = take<0,R-1>(lhs_stride); + + // Divide out the rhs_stride from the lhs_shape + auto const [result_shape_1, rest_stride] = fold(result_shape_0, cute::make_tuple(cute::make_tuple(), rhs_stride), + [] (auto const& init, auto const& di) { + return cute::make_tuple(append(get<0>(init), shape_div(di, get<1>(init))), shape_div(get<1>(init), di)); + }); + + // Apply any lhs_shape changes to the stride + auto result_stride_1 = elem_scale(result_stride_0, shape_div(result_shape_0, result_shape_1)); + + // Mod out the rhs_shape from the lhs_shape + auto const [result_shape_2, rest_shape] = fold(result_shape_1, cute::make_tuple(cute::make_tuple(), rhs_shape), + [] (auto const& init, auto const& si) { + return cute::make_tuple(append(get<0>(init), shape_min(abs(si), get<1>(init))), shape_div(get<1>(init), abs(si))); + }); + + // Jump into coalesce and append (rest_shape, rest_stride * get(lhs_stride)) + return detail::bw_coalesce(result_shape_2, result_stride_1, rest_shape, rest_stride * get(lhs_stride)); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Layout const& lhs, + Layout const& rhs) +{ + auto coprofile = repeat_like(decltype(coshape(rhs)){}, Int<0>{}); + auto flat_lhs = detail::coalesce_x(lhs, coprofile); + return detail::composition_impl(flat_lhs.shape(), flat_lhs.stride(), rhs.shape(), rhs.stride()); +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Layout const& lhs, + Tiler const& rhs) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value <= Layout::rank); + // Drop any modes of lhs that aren't hit by rhs + return detail::transform_layout(lhs, rhs, [](auto const& l, auto const& r) { return composition(l,r); }, make_seq::value>{}, seq<>{}, seq<>{}); + } else if constexpr (is_underscore::value) { + return lhs; + } else if constexpr (is_integral::value) { + auto flat_lhs = detail::coalesce_x(lhs); + return detail::composition_impl(flat_lhs.shape(), flat_lhs.stride(), rhs, Int<1>{}); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Complement +// +// Build the complement of a layout. 
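+// Illustrative examples, worked from the definition below:
+//   complement(_4:_1, Int<24>{})           -> _6:_4
+//   complement((_2,_2):(_1,_6), Int<24>{}) -> (_3,_2):(_2,_12)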
+// @post size(@a result) >= @a cosize_hi / size(filter(@a layout))); +// @post For all i in [1,size(@a result)), +// @a result(i) < @a result(i-1) +// For all j in [0, size(@a layout)), +// @a result(i) != @a layout(j) +// + +namespace detail { + +// @pre @a layout has been filtered (flattened and no stride-0 or size-1 modes). +template +CUTE_HOST_DEVICE constexpr +auto +complement(Shape const& shape, Stride const& stride, CoTarget const& cotarget) +{ + if constexpr (is_constant<0, Stride>::value) { + // Special case for irreducible rank-1 stride-0 layout + return make_layout(coalesce(cotarget)); + } else { + // General case + constexpr int R = rank_v; + static_assert(R == 1 || is_static::value, + "Dynamic-stride complement only for rank-1 layouts"); + + // Should just be a sort and a fold... + // Then we could even handle dynamic strides (but they would destroy all static strides) + auto [shape_, stride_, result_shape_, result_stride] = + fold(make_seq{}, + cute::make_tuple(shape, stride, cute::make_tuple(), cute::make_tuple(Int<1>{})), + [](auto const& init, auto i) + { + auto [shape, stride, result_shape, result_stride] = init; + auto min_stride = cute::min(stride); + auto min_idx = cute::find(stride, min_stride); + auto new_shape = min_stride / get(result_stride); + auto new_stride = min_stride * get(shape); + static_assert(not is_constant<0, decltype(new_shape)>::value, "Non-injective Layout detected in complement."); + + return cute::make_tuple(remove(shape), // Remove the min_idx from shape + remove(stride), // Remove the min_idx from stride + append(result_shape , new_shape ), // new shape = min_stride / last_stride + append(result_stride, new_stride)); // new stride = min_stride * curr_shape + }); + + // Append the last shape mode + auto new_shape = get<0>(stride_) / get(result_stride); // new shape = min_stride / last_stride + static_assert(not is_constant<0, decltype(new_shape)>::value, "Non-injective Layout detected in complement."); + auto result_shape = append(result_shape_, new_shape); + + // Compute the rest_shape and rest_stride + auto new_stride = get<0>(stride_) * get<0>(shape_); // new stride = min_stride * curr_shape + auto rest_shape = coalesce(ceil_div(cotarget, new_stride)); + auto rest_stride = compact_major(rest_shape, new_stride); + + // Coalesce and append (rest_shape, rest_stride) + return coalesce(make_layout(make_shape (result_shape , rest_shape ), + make_stride(result_stride, rest_stride))); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +complement(Layout const& layout, CoTarget const& cotarget) +{ + auto filter_layout = filter(layout); + return detail::complement(filter_layout.shape(), filter_layout.stride(), shape(cotarget)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +complement(Layout const& layout) +{ + auto filter_layout = filter(layout); + return detail::complement(filter_layout.shape(), filter_layout.stride(), cosize(filter_layout)); +} + +// +// Right-Inverse and Left-Inverse +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +inverse_seq(Shape const& shape, Stride const& stride, seq) +{ + auto next_I = cute::find_if(stride, [](auto a) { return is_constant{}; }); + + if constexpr (next_I == decltype(rank(stride))::value) { + // If not found, return current seq + return seq{}; + } else { + // auto next_stride = get(shape) * get(stride); + // NOTE: Needed for g++-7 + using next_stride = decltype(get(shape) * get(stride)); + + if constexpr (is_static::value && 
!is_constant::value) { + // If next_stride is static and unique, then continue + return inverse_seq(shape, stride, seq{}); + } else { + // Else return current seq + next_I + return seq{}; + } + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +// +// Build the right-inverse of a layout +// @pre is_static +// @result A layout @a result such that +// @a layout(@a result(i)) == i for all i < size(@a result) +// @result A layout @a result such that +// composition(@a layout, @a result) is identical to make_layout(shape(result)) +// + +template +CUTE_HOST_DEVICE constexpr +auto +right_inverse(Layout const& layout) +{ + auto flat_layout = coalesce(layout); + auto astride = transform_leaf(flat_layout.stride(), abs_fn{}); + + // Find Int<1>{}, the starting stride, and follow the strides to gen inverse_seq + [[maybe_unused]] auto iseq = detail::inverse_seq<1>(flat_layout.shape(), astride, seq<>{}); + + if constexpr (iseq.size() == 0) { + return Layout<_1,_0>{}; // Empty case, nothing found + } else { + // Generate the corresponding new strides and construct + auto rstride = compact_major(flat_layout.shape()); + return make_layout(unwrap(transform(iseq, [&](auto i) { return shape(flat_layout); })), + unwrap(transform(iseq, [&](auto i) { return signum(stride(flat_layout)) * get(rstride); }))); + } + + CUTE_GCC_UNREACHABLE; +} + +CUTE_HOST_DEVICE constexpr +auto +right_inverse(Underscore const& _) +{ + return _; +} + +// +// Build the left-inverse of a layout +// @pre is_static +// @pre @a layout is an injective function +// @result A layout @a result such that +// @a result(@a layout(i)) == i for all i < size(@a layout) +// @result A layout @a result such that +// composition(@a result, @a layout) is identical to make_layout(shape(layout)) +// + +template +CUTE_HOST_DEVICE constexpr +auto +left_inverse(Layout const& layout) +{ + return right_inverse(make_layout(layout, complement(layout))); +} + +CUTE_HOST_DEVICE constexpr +auto +left_inverse(Underscore const& _) +{ + return _; +} + +// +// Max Common Layout +// + +/* Return a layout that points to the maximum number of contiguous elements + * that logically correspond in the layouts of @a a and @a b. + * + * @returns Layout R + * @post For all 0 <= i < size(R), a(R(i)) == i and b(R(i)) == i + */ +template +CUTE_HOST_DEVICE constexpr +auto +max_common_layout(Layout const& a, + Layout const& b) +{ + Layout inv_b = right_inverse(b); + Layout common = coalesce(composition(a, inv_b)); + + // Keep only the static identity component of the common layout + if constexpr (is_static(common))>::value && + is_constant<1, decltype(stride<0>(common))>::value) { + // Truncate to the size of the contiguous vector (static stride-1 mode) + return composition(inv_b, layout<0>(common)); + } else { + return Layout<_1,_0>{}; + } +} + +/* Return Int such that N is the maximum number of contiguous elements + * that logically correspond in the layouts of @a a and @a b. 
+ * + * @returns Int with N >= 1 + * @post For all 0 <= n < N, a(b.get_1d_coord(n)) == n + * (NOTE: Problems with negative strides/coords in this post-condition) + */ +template +CUTE_HOST_DEVICE constexpr +auto +max_common_vector(Layout const& a, + Layout const& b) +{ + Layout common = coalesce(composition(a, right_inverse(b))); + + // Keep only the static identity component of the common layout + if constexpr (is_static(common))>::value && + is_constant<1, decltype(stride<0>(common))>::value) { + // Truncate to the size of the contiguous vector (static stride-1 mode) + return shape<0>(common); + } else { + return Int<1>{}; + } + + CUTE_GCC_UNREACHABLE; +} + +/* Return a layout that distributes ShapeB over ShapeA. + * + * @returns Layout result + * @post evenly_divides(@a b, size(@a result)) + * @post evenly_divides(@a a, @a result) + * @post For all i,j in [0,size(@a result)) with i < j, @a result(i) < @a result(j). Surjective and Ordered. + * @post composition(make_layout(shape(@a a)), @a result) is admissible + * \code + * // Note that 6 does not divide this shape + * Layout layoutA = Layout,Int<14>>>{}; + * + * // Want to tile any 6 elements and don't care where they come from + * Layout dist = domain_distribute(layoutA, Int<6>{}); // (_3,_2):(_1,_15) + * + * // Not guaranteed to find all 6 though... + * CUTE_STATIC_ASSERT_V(Int<6>{} == size(dist)); + * + * Layout result = zipped_divide(layoutA, dist); // (_6,Rest) + * \endcode + */ +template +CUTE_HOST_DEVICE constexpr +auto +domain_distribute(ShapeA const& a, ShapeB const& b) +{ + static_assert(is_integral::value); + static_assert(is_static::value); + + auto flat_shape_a = flatten(shape(a)); + + static_assert(is_static::value); + + // Compute the shape of the result + auto [result_shape, b_rest] = cute::fold(flat_shape_a, cute::make_tuple(cute::tuple<>{}, size(b)), [](auto init, auto a_) { + auto [result, b_] = init; + auto gcd_ = gcd(a_, b_); + return cute::make_tuple(append(result, gcd_), b_ / gcd_); + }); + + // Compute the stride of the result + auto result_stride = compact_major(flat_shape_a); + + return coalesce(make_layout(result_shape, result_stride)); +} + +// +// Kernel (Nullspace) of a Layout +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +nullspace_seq(Stride const& stride, seq) +{ + if constexpr (NextI == rank_v) { + return seq{}; + } else + if constexpr (is_constant<0, decltype(get(stride))>::value) { + return detail::nullspace_seq(stride, seq{}); + } else { + return detail::nullspace_seq(stride, seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +// +// Build the nullspace of a layout +// @result A layout @a result such that +// size(@a result) == size(@a layout) / size(filter(@a layout)) +// @a layout(@a result(i)) == 0 for all i < size(@a result) +// + +template +CUTE_HOST_DEVICE constexpr +auto +nullspace(Layout const& layout) +{ + auto flat_layout = flatten(layout); + + auto iseq = detail::nullspace_seq<0>(flat_layout.stride(), seq<>{}); + + if constexpr (iseq.size() == 0) { + return Layout<_1,_0>{}; // Empty case, nothing found + } else { + // Generate the corresponding new strides and construct + auto rstride = compact_major(flat_layout.shape()); + return make_layout(unwrap(transform(iseq, [&](auto i) { return shape(flat_layout); })), + unwrap(transform(iseq, [&](auto i) { return get(rstride); }))); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Zip +// + +template +CUTE_HOST_DEVICE constexpr +auto +zip(Layout const& layout) +{ + return make_layout(zip(layout.shape()), + 
zip(layout.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +zip(Layout const& layoutA, + Layout const& layoutB) +{ + return make_layout(zip(layoutA.shape(), layoutB.shape()), + zip(layoutA.stride(), layoutB.stride())); +} + +// +// Tile unzip +// Logical product and logical divide (on layouts) produce rank-2 results by design. +// Follow the profile of @a tile and zip the rank-2 modes located at the terminals into +// their own mode. +// + +template +CUTE_HOST_DEVICE constexpr +auto +tile_unzip(Layout const& layout, + Tiler const& tiler) +{ + return make_layout(zip2_by(layout.shape(), tiler), + zip2_by(layout.stride(), tiler)); +} + +// +// Logical divide +// + +template +CUTE_HOST_DEVICE constexpr +auto +logical_divide(Layout const& layout, + Layout const& tiler) +{ + return composition(layout, make_layout(tiler, complement(tiler, shape(layout)))); +} + +template +CUTE_HOST_DEVICE constexpr +auto +logical_divide(Layout const& layout, + Tiler const& tiler) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value <= Layout::rank, "logical_divide: Too many modes in tiler."); + return transform_layout(layout, tiler, [](auto const& l, auto const& t) { return logical_divide(l,t); }); + } else if constexpr (is_underscore::value) { + return layout; + } else if constexpr (is_integral::value) { + return logical_divide(layout, make_layout(tiler)); + } + + CUTE_GCC_UNREACHABLE; +} + +// Generalization of ceil_div for Layout lhs +// is effectively the "rest mode" of logical_divide. +// Occurs in the calculation of gridDim, for example, for generalized tilers +// Example: +// dim3 gridDim(size(ceil_div(problem_shape_M, cta_tiler_M)), +// size(ceil_div(problem_shape_N, cta_tiler_N))); +// This does not consider compositional acceptance, so it may be the case that +// ceil_div produces a result while logical_divide (and friends) do not. +template +CUTE_HOST_DEVICE constexpr +auto +ceil_div(Target const& target, + Layout const& tiler) +{ + return shape(complement(tiler, shape(target))); +} + +// +// Convenience operator +// that produces layouts like ((BLK_A,BLK_B,...),(a,b,...,x,y)) +// by gathering the tile modes and residuals into a rank-2 result. 
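+// Illustrative example, worked from logical_divide above (an 8x6 column-major matrix
+// tiled by 4x3; values chosen for illustration only):
+//   zipped_divide((_8,_6):(_1,_8), Shape<_4,_3>{}) -> ((_4,_3),(_2,_2)):((_1,_8),(_4,_24))
+//   i.e. ((tile modes),(rest modes)); tile (i,j) starts at offset 4*i + 24*j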
+// + +template +CUTE_HOST_DEVICE constexpr +auto +zipped_divide(Layout const& layout, + Tiler const& tiler) +{ + return tile_unzip(logical_divide(layout, tiler), tiler); +} + +// Same as zipped_divide, but unpacks the second mode: ((BLK_A,BLK_B,...),a,b,...,x,y) +template +CUTE_HOST_DEVICE constexpr +auto +tiled_divide(Layout const& layout, + Tiler const& tiler) +{ + auto result = zipped_divide(layout, tiler); + + auto R1 = rank<1>(result); + return result(_, repeat(_)); +} + +// Same as zipped_divide, but unpacks both modes: (BLK_A,BLK_B,...,a,b,...,x,y) +template +CUTE_HOST_DEVICE constexpr +auto +flat_divide(Layout const& layout, + Tiler const& tiler) +{ + auto result = zipped_divide(layout, tiler); + + auto R0 = rank<0>(result); + auto R1 = rank<1>(result); + return result(repeat(_), repeat(_)); +} + +// +// Logical product +// + +template +CUTE_HOST_DEVICE constexpr +auto +logical_product(Layout const& block, + Layout const& tiler) +{ + return make_layout(block, composition(complement(block, size(block)*cosize(tiler)), tiler)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +logical_product(Layout const& block, + Tiler const& tiler) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value <= Layout::rank, "logical_product: Too many modes in tiler."); + return transform_layout(block, tiler, [](auto const& l, auto const& t) { return logical_product(l,t); }); + } else if constexpr (is_underscore::value) { + return block; + } else if constexpr (is_integral::value) { + return logical_product(block, make_layout(tiler)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Convenience operator +// that produces layouts like ((BLK_A,BLK_B,...),(a,b,...,x,y)) +// by gathering the block modes and products into a rank-2 result. +// + +template +CUTE_HOST_DEVICE constexpr +auto +zipped_product(Layout const& block, + Tiler const& tiler) +{ + return tile_unzip(logical_product(block, tiler), tiler); +} + +// Same as zipped_product, but unpacks the second mode: ((BLK_A,BLK_B,...),a,b,...,x,y) +template +CUTE_HOST_DEVICE constexpr +auto +tiled_product(Layout const& block, + Tiler const& tiler) +{ + auto result = zipped_product(block, tiler); + + auto R1 = rank<1>(result); + return result(_, repeat(_)); +} + +// Same as zipped_product, but unpacks both modes: (BLK_A,BLK_B,...,a,b,...,x,y) +template +CUTE_HOST_DEVICE constexpr +auto +flat_product(Layout const& block, + Tiler const& tiler) +{ + auto result = zipped_product(block, tiler); + + auto R0 = rank<0>(result); + auto R1 = rank<1>(result); + return result(repeat(_), repeat(_)); +} + +// +// Rank-sensitive products +// + +// blocked_product -- Reproduce a block over a tiler. +// Think of every element of "tiler" as a "block" +// and return the layout of the resulting structure. +// @post rank(@a result) == cute::max(rank(@a block), rank(@a tiler)) +template +CUTE_HOST_DEVICE constexpr +auto +blocked_product(Layout const& block, + Layout const& tiler) +{ + constexpr int R = cute::max(rank_v, rank_v); + + auto result = logical_product(append(block), append(tiler)); + + return coalesce(zip(get<0>(result), get<1>(result)), tuple_repeat(Int<1>{})); +} + +// raked_product -- Reproduce a block over a tiler with block-interleaving. +// Think of every element of "tiler" as a "block", interleave those blocks, +// and return the layout of the resulting structure. 
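+// Illustrative comparison, worked by hand from logical_product above (values assumed):
+//   blocked_product((_2,_2):(_1,_2), (_3,_4):(_1,_3)) -> ((_2,_3),(_2,_4)):((_1,_4),(_2,_12))
+//   raked_product  ((_2,_2):(_1,_2), (_3,_4):(_1,_3)) -> ((_3,_2),(_4,_2)):((_4,_1),(_12,_2))
+// Both cover the same 6x8 codomain; blocked keeps each 2x2 block contiguous, raked interleaves them.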
+// @post rank(@a result) == cute::max(rank(@a block), rank(@a tiler)) +template +CUTE_HOST_DEVICE constexpr +auto +raked_product(Layout const& block, + Layout const& tiler) +{ + constexpr int R = cute::max(rank_v, rank_v); + + auto result = logical_product(append(block), append(tiler)); + + return coalesce(zip(get<1>(result), get<0>(result)), tuple_repeat(Int<1>{})); +} + +// tile_to_shape -- Perform a product of a layout so that the result matches a target shape. +// This is similar to blocked_product, but specifies the result shape instead of the +// product shape, which is more convenient in certain circumstances. +// @param block The layout to repeat +// @param trg_shape The target shape of the result +// @param ord_shape The order of the modes of @a trg_shape to tile @a layout with. +// Defaults to GenColMajor, so @a layout will repeat +// across the first mode first, the second mode second, etc +// E.g. Step<_2,_1,_3> will cause @a layout to repeat +// across the second mode first, the first mode second, and the third mode last. +// @pre rank(@a block) <= rank(@a trg_shape) +// @post compatible(@a trg_shape, shape(@a result)) +template +CUTE_HOST_DEVICE constexpr +auto +tile_to_shape(Layout const& block, + TrgShape const& trg_shape, + ModeOrder const& ord_shape = {}) +{ + CUTE_STATIC_ASSERT_V(rank(block) <= rank(trg_shape), "Rank of layout must be <= rank of target shape."); + constexpr int R = rank_v; + + auto padded_block = append(block); + + auto block_shape = product_each(shape(padded_block)); + auto target_shape = product_each(shape(trg_shape)); + + // Assert proper division + if constexpr (is_static::value) { + CUTE_STATIC_ASSERT_V(evenly_divides(target_shape, block_shape), + "tile_to_shape: block shape does not divide the target shape."); + } + + auto product_shape = ceil_div(target_shape, block_shape); + + return coalesce(blocked_product(padded_block, make_ordered_layout(product_shape, ord_shape)), product_shape); +} + +// +// Upcast +// For stride-1 mode, divide size by N. Divide all other strides by N. +// + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(Shape const& shape, Stride const& stride) +{ + if constexpr (is_tuple::value) { // tuple stride + return transform_layout(shape, stride, [](auto const& s, auto const& d) { return upcast(s,d); }); + } else if constexpr (is_constant<0, Stride>::value) { // static-0 stride + return Layout{shape,stride}; + } else if constexpr (is_static::value) { // static stride + return make_layout(shape_div(shape, shape_div(Int{}, abs(stride))), + shape_div(stride, Int{})); + } else { // dynamic stride + // assume dynamic strides are larger than N and divisible + // assert(stride % N == 0); + return make_layout(shape, safe_div(stride, Int{})); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(Layout const& layout) +{ + return upcast(layout.shape(), layout.stride()); +} + +// +// Downcast +// For stride-1 mode, multiply size by N. Multiply all other strides by N. 
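+// Illustrative round trip, worked from the definitions (e.g. viewing 16-bit data as 32-bit and back):
+//   upcast<2>((_8,_4):(_1,_8))   -> (_4,_4):(_1,_4)
+//   downcast<2>((_4,_4):(_1,_4)) -> (_8,_4):(_1,_8)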
+// + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(Shape const& shape, Stride const& stride) +{ + if constexpr (is_tuple::value) { + return transform_layout(shape, stride, [](auto const& s, auto const& d) { return downcast(s,d); }); + } else if constexpr (is_constant<1, Stride>::value || is_constant<-1, Stride>::value) { + return make_layout(shape * Int{}, stride); + } else { + return make_layout(shape, stride * Int{}); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(Layout const& layout) +{ + CUTE_STATIC_ASSERT(has_int1::value, "Downcast requires adjacent elements"); + return downcast(layout.shape(), layout.stride()); +} + +// +// Recast +// + +template +CUTE_HOST_DEVICE constexpr +auto +recast_layout(Layout const& layout) +{ + using scale = decltype(trait_ratio(sizeof_bits{}, sizeof_bits{})); + if constexpr (scale::num == 1 && scale::den == 1) { + return layout; + } + else if constexpr (scale::num == 1) { + return downcast(layout); + } + else if constexpr (scale::den == 1) { + return upcast(layout); + } + else { + return downcast(upcast(layout)); + } + + CUTE_GCC_UNREACHABLE; +} + +// Determine the maximum alignment of a Layout. +// The maximum alignment is the largest N for which upcast(layout) will compile. +// upcast(layout) compiles when the static shapes and strides pass divisibility checks. +// Therefore, upcast(layout) will also compile for all divisors M of N. +// Note that this only considers the static shapes and strides of the Layout +// in symmetry with upcast only checking against static shapes and strides and assuming all +// dynamic shapes and strides are large and multiples of N. +template +CUTE_HOST_DEVICE constexpr +auto +max_alignment(Layout const& layout) +{ + auto flat_layout = coalesce(layout); + auto static_shape = transform( shape(flat_layout), [](auto s){ return conditional_return::value>(s, Int<1>{}); }); + auto static_stride = transform(stride(flat_layout), [](auto d){ return conditional_return::value>(d, Int<0>{}); }); + auto filter_layout = make_layout(static_shape, static_stride); + auto permuted = logical_divide(filter_layout, right_inverse(filter_layout)); + return gcd(size<0>(permuted), stride<1>(permuted)); +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(Layout const& layout) +{ + print(layout.shape()); print(":"); print(layout.stride()); +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, Layout const& layout) +{ + return os << shape(layout) << ":" << stride(layout); +} +#endif + +// Generic 2D Layout to console table +template +CUTE_HOST_DEVICE +void +print_layout(Layout const& layout) // (m,n) -> idx +{ + CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{}); + + int idx_width = num_digits(cosize(layout)) + 2; + const char* delim = "+-----------------------"; + + print(layout); print("\n"); + + // Column indices + print(" "); + for (int n = 0; n < size<1>(layout); ++n) { printf(" %*d ", idx_width-2, n); } + printf("\n"); + + // Print out A m-by-n + for (int m = 0; m < size<0>(layout); ++m) { + // Header + print(" "); + for (int n = 0; n < size<1>(layout); ++n) { printf("%.*s", idx_width+1, delim); } + printf("+\n"); + // Values + printf("%2d ", m); // Row indices + for (int n = 0; n < size<1>(layout); ++n) { printf("| %*d ", idx_width-2, int(layout(m,n))); } + printf("|\n"); + } + // Footer + print(" "); + for (int n = 0; n < size<1>(layout); ++n) { printf("%.*s", idx_width+1, delim); } + printf("+\n"); +} + +// Generic ThrVal 2D 
Layout to console table +template +CUTE_HOST_DEVICE +void +print_layout(Layout const& layout, ThrID const& thrid) // (m,n) -> (tid,vid) and tid -> thr_idx +{ + CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{}); + + print(layout); print("\n"); + print(thrid); print("\n"); + + // Print out m-by-n + for (int m = 0; m < size<0>(layout); ++m) { + // Header + for (int n = 0; n < size<1>(layout); ++n) printf("+------"); + printf("+\n"); + // Values + for (int n = 0; n < size<1>(layout); ++n) printf("|%03d-%02d", int(thrid(layout(m,n) % size(thrid))), int(layout(m,n) / size(thrid))); + printf("|\n"); + } + // Footer + for (int n = 0; n < size<1>(layout); ++n) printf("+------"); + printf("+\n"); +} + +struct TikzColor_White { + CUTE_HOST_DEVICE char const* + operator()(int idx) const { + return "white"; + } +}; + +struct TikzColor_BWx8 { + CUTE_HOST_DEVICE char const* + operator()(int idx) const { + static char const* color_map[8] = {"black!00", "black!40", "black!20", "black!60", + "black!10", "black!50", "black!30", "black!70"}; + return color_map[idx % 8]; + } +}; + +struct TikzColor_TV { + CUTE_HOST_DEVICE char const* + operator()(int tid, int vid) const { + static char const* color_map[8] = {"{rgb,255:red,175;green,175;blue,255}", + "{rgb,255:red,175;green,255;blue,175}", + "{rgb,255:red,255;green,255;blue,175}", + "{rgb,255:red,255;green,175;blue,175}", + "{rgb,255:red,210;green,210;blue,255}", + "{rgb,255:red,210;green,255;blue,210}", + "{rgb,255:red,255;green,255;blue,210}", + "{rgb,255:red,255;green,210;blue,210}"}; + return color_map[tid % 8]; + } +}; + +// Generic 2D Layout to LaTeX printer +template +CUTE_HOST_DEVICE +void +print_latex(LayoutA const& layout_a, // (m,n) -> idx + TikzColorFn color = {}) // lambda(idx) -> tikz color string +{ + CUTE_STATIC_ASSERT_V(rank(layout_a) <= Int<2>{}); + auto layout = append<2>(layout_a, Layout<_1,_0>{}); + + // Commented print(layout) + printf("%% Layout: "); print(layout); printf("\n"); + // Header + printf("\\documentclass[convert]{standalone}\n" + "\\usepackage{tikz}\n\n" + "\\begin{document}\n" + "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n"); + + // Layout + for (int i = 0; i < size<0>(layout); ++i) { + for (int j = 0; j < size<1>(layout); ++j) { + int idx = layout(i,j); + printf("\\node[fill=%s] at (%d,%d) {%d};\n", + color(idx), i, j, idx); + } + } + // Grid + printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (0,0) grid (%d,%d);\n\n", + int(size<0>(layout)), int(size<1>(layout))); + // Labels + for (int i = 0, j = -1; i < size<0>(layout); ++i) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, i); + } + for (int i = -1, j = 0; j < size<1>(layout); ++j) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, j); + } + + // Footer + printf("\\end{tikzpicture}\n" + "\\end{document}\n"); +} + +// Generic ThrVal 2D Layout to LaTeX TikZ +template +CUTE_HOST_DEVICE +void +print_latex(Layout const& layout, // (m,n) -> (tid,vid) + ThrID const& thr, // tid -> thr_idx + TikzColorFn color = {}) // lambda(thr_idx,val_idx) -> tikz color string +{ + CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{}); + + // Commented prints + printf("%% Layout: "); print(layout); printf("\n"); + printf("%% ThrID : "); print(thr); printf("\n"); + // Header + printf("\\documentclass[convert]{standalone}\n" + "\\usepackage{tikz}\n\n" + "\\begin{document}\n" + "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n"); + + // Layout + for (int 
i = 0; i < size<0>(layout); ++i) { + for (int j = 0; j < size<1>(layout); ++j) { + int thrid = layout(i,j) % size(thr); + int val_idx = layout(i,j) / size(thr); + int thr_idx = thr(thrid); + + printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color(thr_idx, val_idx), + i, j, + thr_idx, val_idx); + } + } + // Grid + printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (0,0) grid (%d,%d);\n\n", + int(size<0>(layout)), int(size<1>(layout))); + // Labels + for (int i = 0, j = -1; i < size<0>(layout); ++i) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, i); + } + for (int j = 0, i = -1; j < size<1>(layout); ++j) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, j); + } + + // Footer + printf("\\end{tikzpicture}\n" + "\\end{document}\n"); +} + +} // end namespace cute diff --git a/include/cute/layout_composed.hpp b/include/cute/layout_composed.hpp new file mode 100644 index 0000000000..26ae8dc76c --- /dev/null +++ b/include/cute/layout_composed.hpp @@ -0,0 +1,661 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE, CUTE_GCC_UNREACHABLE +#include // cute::tuple +#include // cute::true_type, cute::false_type, cute::Int + +/* This implements a ComposedLayout of the form + * LayoutA o Offset o LayoutB + * and is useful in cases where composition() does not or cannot apply to LayoutA and LayoutB. + * For example, when the "divisibility condition" in shape_div is violated in composition(LayoutA, LayoutB). + * + * This ComposedLayout provides similar functionality to Layout including tiling, partitioning, + * coordinate-to-index mapping and layout manipulations, but is not considered a "normal" layout. 
+ * For example, this layout provides shape() and size() functions, but does not provide stride() functions. + * Mostly, the similar functionality is accomplished by applying each operation to LayoutB only + * as LayoutB defines the domain. + */ + +namespace cute +{ + +// A Layout of non-trivially composable functions: F o I o L +template +struct ComposedLayout : private cute::tuple // EBO for static layouts +{ + CUTE_HOST_DEVICE constexpr + ComposedLayout(LayoutA const& layoutA = {}, + Offset const& offset = {}, + LayoutB const& layoutB = {}) + : cute::tuple(layoutA, offset, layoutB) + {} + + // + // Accessors + // + + static constexpr int rank = LayoutB::rank; + + CUTE_HOST_DEVICE constexpr + decltype(auto) + layout_a() const { + return get<0>(static_cast const&>(*this)); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + offset() const { + return get<1>(static_cast const&>(*this)); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + layout_b() const { + return get<2>(static_cast const&>(*this)); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + layout() const { + return *this; + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + shape() const { + return layout_b().shape(); + } + + // Doesn't really make sense to ask for the strides of this "layout" + CUTE_HOST_DEVICE constexpr + decltype(auto) + stride() const = delete; + + // + // Mappings + // + + // Map a logical coordinate to a linear index (Coord has no Underscore slice operators) + // OR + // Slice the layout and return the sublayout (Coord has an Underscore slice op) + template + CUTE_HOST_DEVICE constexpr + auto + operator()(Coord const& coord) const { + if constexpr (has_underscore::value) { + return slice(coord, *this); + } else { + return layout_a()(offset() + layout_b()(coord)); // (A o O o B)(c) + } + + CUTE_GCC_UNREACHABLE; + } + + // Convenience function for multi-dimensional coordinates + template + CUTE_HOST_DEVICE constexpr + auto + operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const { + return operator()(make_coord(c0,c1,cs...)); + } + + // + // Compose + // + + template + CUTE_HOST_DEVICE constexpr + auto + compose(OtherLayout const& other) const { + return composition(*this, other); + } + + template + CUTE_HOST_DEVICE constexpr + auto + compose(Layouts const&... layouts) const { + return composition(*this, make_tile(layouts...)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + with_shape(OtherShape const& shape) const { + return composition(*this, make_layout(shape)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + with_shape(Shapes const&... shapes) const { + return composition(*this, make_layout(make_shape(shapes...))); + } + + // + // Tile + // + + template + CUTE_HOST_DEVICE constexpr + auto + tile(OtherLayout const& other) const { + return tiled_divide(*this, other); + } + + template + CUTE_HOST_DEVICE constexpr + auto + tile(Layouts const&... 
layouts) const { + return tiled_divide(*this, make_tile(layouts...)); + } + + // Equality, return a static or dynamic boolean + template + CUTE_HOST_DEVICE constexpr + auto + operator==(ComposedLayout const& other) const { + return this->layout_a() == other.layout_a() && + this->layout_b() == other.layout_b() && + this->offset() == other.offset(); + } +}; + +template +struct is_layout> : true_type {}; + +template +struct is_composed_layout : false_type {}; +template +struct is_composed_layout> : true_type {}; + +// +// Constructors +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_composed_layout(LayoutA const& layoutA, + Offset const& offset, + LayoutB const& layoutB) +{ + return ComposedLayout{layoutA, offset, layoutB}; +} + +// +// Utilities +// + +// Return the layout of a mode +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +layout(ComposedLayout const& clayout) +{ + return composition(clayout.layout_a(), clayout.offset(), layout(clayout.layout_b())); +} + +// Return the shape of a mode +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +shape(ComposedLayout const& layout) +{ + return shape(layout.layout_b()); +} + +// Doesn't make sense to directly ask for the strides of this "layout" +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +stride(ComposedLayout const& layout) = delete; + +// Return the number of elements in a mode +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +size(ComposedLayout const& layout) +{ + return size(layout.layout_b()); +} + +// Return the number of modes +template +CUTE_HOST_DEVICE constexpr +auto +rank(ComposedLayout const& layout) +{ + return rank(layout.layout_b()); +} + +// Return the depth of the layout +template +CUTE_HOST_DEVICE constexpr +auto +depth(ComposedLayout const& layout) +{ + return depth(layout.layout_b()); +} + +// Return the codomain size of a mode +template +CUTE_HOST_DEVICE constexpr +auto +cosize(ComposedLayout const& layout) +{ + return cosize(layout.layout_b()); +} + +// +// Operations to manipulate Layouts like a tuple of pairs +// + +template +CUTE_HOST_DEVICE constexpr +auto +get(ComposedLayout const& a) +{ + return composition(a.layout_a(), a.offset(), get(a.layout_b())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +take(ComposedLayout const& a) +{ + return composition(a.layout_a(), a.offset(), take(a.layout_b())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +flatten(ComposedLayout const& a) +{ + return composition(a.layout_a(), a.offset(), flatten(a.layout_b())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +append(ComposedLayout const& a, X const& x) +{ + return composition(a.layout_a(), a.offset(), append(a.layout_b(), x)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +group(ComposedLayout const& a) +{ + return composition(a.layout_a(), a.offset(), group(a.layout_b())); +} + +// +// Slice a ComposedLayout +// + +template +CUTE_HOST_DEVICE constexpr +auto +slice_and_offset(Coord const& coord, ComposedLayout const& layout) +{ + auto [slice, offset] = slice_and_offset(coord, layout.layout_b()); + return cute::make_tuple(ComposedLayout{layout.layout_a(), layout.offset() + offset, slice}, Int<0>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +slice(Coord const& coord, ComposedLayout const& layout) +{ + return get<0>(slice_and_offset(coord, layout)); +} + +// Compute a pointer offset and (potentially modified) layout from a coordinate +// For composed layout tensors the offset is accumulated in the layout itself while pointer is not updated +template +CUTE_HOST_DEVICE constexpr +auto 
+domain_offset(Coord const& coord, ComposedLayout const& layout) +{ + return cute::make_tuple(ComposedLayout{layout.layout_a(), layout.offset() + layout.layout_b()(coord), layout.layout_b()}, Int<0>{}); +} + +// +// composition +// + +template +CUTE_HOST_DEVICE constexpr +auto +composition(LayoutA const& layoutA, + Offset const& offset, + LayoutB const& layoutB) +{ + return ComposedLayout{layoutA, offset, layoutB}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), composition(a.layout_b(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Layout const& a, + ComposedLayout const& b) +{ + CUTE_STATIC_ASSERT_V(b.offset() == Int<0>{}, "Require offset == 0."); + + return composition(composition(a, b.layout_a()), b.layout_b()); +} + +// +// complement +// + +template +CUTE_HOST_DEVICE constexpr +auto +complement(ComposedLayout const& layout, CoTarget const& cotarget) +{ + return complement(layout.layout_b(), cotarget); +} + +template +CUTE_HOST_DEVICE constexpr +auto +complement(ComposedLayout const& layout) +{ + return complement(layout, cosize(layout)); +} + +// +// inverse +// + +template +CUTE_HOST_DEVICE constexpr +auto +right_inverse(ComposedLayout const& layout) +{ + return composition(right_inverse(layout.layout_b()), right_inverse(layout.offset()), right_inverse(layout.layout_a())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +left_inverse(ComposedLayout const& layout) +{ + return composition(left_inverse(layout.layout_b()), left_inverse(layout.offset()), left_inverse(layout.layout_a())); +} + +// +// Other operations +// + +template +CUTE_HOST_DEVICE constexpr +auto +zip(ComposedLayout const& a) +{ + return composition(a.layout_a(), a.offset(), zip(a.layout_b())); +} + +// Partitions + +template +CUTE_HOST_DEVICE constexpr +auto +logical_divide(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), logical_divide(a.layout_b(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tile_unzip(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), tile_unzip(a.layout_b(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tiled_divide(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), tiled_divide(a.layout_b(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +zipped_divide(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), zipped_divide(a.layout_b(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +flat_divide(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), flat_divide(a.layout_b(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +logical_product(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), logical_product(a.layout_b(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +zipped_product(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), zipped_product(a.layout_b(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tiled_product(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), tiled_product(a.layout_b(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +flat_product(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), flat_product(a.layout_b(), b)); +} + 
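+// Note: each of the tilers/partitioners above and below simply forwards to LayoutB and re-wraps,
+// so the LayoutA transformation (e.g. a swizzle) is preserved. Illustrative identity, with
+// SwizzleFn and L standing in for any layout_a/layout_b pair:
+//   zipped_divide(ComposedLayout{SwizzleFn, _0{}, L}, tiler)
+//     == ComposedLayout{SwizzleFn, _0{}, zipped_divide(L, tiler)}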
+template +CUTE_HOST_DEVICE constexpr +auto +blocked_product(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), blocked_product(a.layout_b(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +raked_product(ComposedLayout const& a, + Tiler const& b) +{ + return composition(a.layout_a(), a.offset(), raked_product(a.layout_b(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tile_to_shape(ComposedLayout const& layout, + Shape const& trg_shape, + ModeOrder const& ord_shape = {}) +{ + return composition(layout.layout_a(), layout.offset(), tile_to_shape(layout.layout_b(), trg_shape, ord_shape)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter(ComposedLayout const& layout, Shape const& trg_profile) +{ + return composition(layout.layout_a(), layout.offset(), filter(layout.layout_b(), trg_profile)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +coalesce(ComposedLayout const& layout) +{ + return composition(layout.layout_a(), layout.offset(), coalesce(layout.layout_b())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +coalesce(ComposedLayout const& layout, Shape const& trg_profile) +{ + return composition(layout.layout_a(), layout.offset(), coalesce(layout.layout_b(), trg_profile)); +} + + +// +// Upcast and Downcast +// + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(ComposedLayout const& layout) +{ + return composition(upcast(layout.layout_a()), upcast(layout.offset()), upcast(layout.layout_b())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(ComposedLayout const& layout) +{ + return composition(downcast(layout.layout_a()), downcast(layout.offset()), downcast(layout.layout_b())); +} + + +template +CUTE_HOST_DEVICE constexpr +auto +recast_layout(ComposedLayout const& layout) +{ + using scale = decltype(trait_ratio(sizeof_bits{}, sizeof_bits{})); + if constexpr (scale::num == 1 && scale::den == 1) { + return layout; + } + else if constexpr (scale::num == 1) { + return downcast(layout); + } + else if constexpr (scale::den == 1) { + return upcast(layout); + } + else { + return downcast(upcast(layout)); + } + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_alignment(ComposedLayout const& layout) +{ + // Do not attempt for general ComposedLayouts + //return gcd(max_alignment(layout.layout_a()), max_alignment(layout.offset()), max_alignment(layout.layout_b())); + return Int<1>{}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +nullspace(ComposedLayout const& layout) +{ + // Do not attempt for general ComposedLayouts + return Layout<_1,_0>{}; +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(ComposedLayout const& layout) +{ + print(layout.layout_a()); print(" o "); print(layout.offset()); print(" o "); print(layout.layout_b()); +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, ComposedLayout const& layout) +{ + return os << layout.layout_a() << " o " << layout.offset() << " o " << layout.layout_b(); +} +#endif + +} // end namespace cute diff --git a/include/cute/numeric/arithmetic_tuple.hpp b/include/cute/numeric/arithmetic_tuple.hpp new file mode 100644 index 0000000000..2e46905719 --- /dev/null +++ b/include/cute/numeric/arithmetic_tuple.hpp @@ -0,0 +1,556 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace cute +{ + +template +struct ArithmeticTuple : tuple +{ + template + CUTE_HOST_DEVICE constexpr + ArithmeticTuple(ArithmeticTuple const& u) + : tuple(static_cast const&>(u)) {} + + template + CUTE_HOST_DEVICE constexpr + ArithmeticTuple(tuple const& u) + : tuple(u) {} + + template + CUTE_HOST_DEVICE constexpr + ArithmeticTuple(U const&... u) + : tuple(u...) {} +}; + +template +struct is_tuple> : true_type {}; + +template +struct is_flat> : is_flat> {}; + +template +CUTE_HOST_DEVICE constexpr +auto +make_arithmetic_tuple(T const&... t) { + return ArithmeticTuple(t...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +as_arithmetic_tuple(T const& t) { + if constexpr (is_tuple::value) { + return detail::tapply(t, [](auto const& x){ return as_arithmetic_tuple(x); }, + [](auto const&... a){ return make_arithmetic_tuple(a...); }, + tuple_seq{}); + } else { + return t; + } +} + +// +// Numeric operators +// + +// Addition +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ArithmeticTuple const& t, ArithmeticTuple const& u) { + constexpr int R = cute::max(int(sizeof...(T)), int(sizeof...(U))); + return transform_apply(append(t,Int<0>{}), append(u,Int<0>{}), plus{}, [](auto const&... 
a){ return make_arithmetic_tuple(a...); }); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ArithmeticTuple const& t, tuple const& u) { + return t + ArithmeticTuple(u); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(tuple const& t, ArithmeticTuple const& u) { + return ArithmeticTuple(t) + u; +} + +// Subtraction +template +CUTE_HOST_DEVICE constexpr +auto +operator-(ArithmeticTuple const& t, ArithmeticTuple const& u) { + constexpr int R = cute::max(int(sizeof...(T)), int(sizeof...(U))); + return transform_apply(append(t,Int<0>{}), append(u,Int<0>{}), minus{}, [](auto const&... a){ return make_arithmetic_tuple(a...); }); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator-(ArithmeticTuple const& t, tuple const& u) { + return t - ArithmeticTuple(u); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator-(tuple const& t, ArithmeticTuple const& u) { + return ArithmeticTuple(t) - u; +} + +// Negation +template +CUTE_HOST_DEVICE constexpr +auto +operator-(ArithmeticTuple const& t) { + return transform_apply(t, negate{}, [](auto const&... a){ return make_arithmetic_tuple(a...); }); +} + +// +// Special cases +// + +template +CUTE_HOST_DEVICE constexpr +ArithmeticTuple const& +operator+(C, ArithmeticTuple const& u) { + static_assert(t == 0, "Arithmetic tuple op+ error!"); + return u; +} + +template +CUTE_HOST_DEVICE constexpr +ArithmeticTuple const& +operator+(ArithmeticTuple const& t, C) { + static_assert(u == 0, "Arithmetic tuple op+ error!"); + return t; +} + +template +CUTE_HOST_DEVICE constexpr +ArithmeticTuple const& +operator-(C, ArithmeticTuple const& u) { + static_assert(t == 0, "Arithmetic tuple op- error!"); + return -u; +} + +template +CUTE_HOST_DEVICE constexpr +ArithmeticTuple const& +operator-(ArithmeticTuple const& t, C) { + static_assert(u == 0, "Arithmetic tuple op- error!"); + return t; +} + +// +// ArithmeticTupleIterator +// + +template +struct ArithmeticTupleIterator +{ + using value_type = ArithTuple; + using element_type = ArithTuple; + using reference = ArithTuple; + + ArithTuple coord_; + + CUTE_HOST_DEVICE constexpr + ArithmeticTupleIterator(ArithTuple const& coord = {}) : coord_(coord) {} + + CUTE_HOST_DEVICE constexpr + ArithTuple operator*() const { return coord_; } + + template + CUTE_HOST_DEVICE constexpr + auto operator[](Coord const& c) const { return *(*this + c); } + + template + CUTE_HOST_DEVICE constexpr + auto operator+(Coord const& c) const { + return ArithmeticTupleIterator>(coord_ + c); + } +}; + +template +CUTE_HOST_DEVICE constexpr +auto +make_inttuple_iter(Tuple const& t) { + return ArithmeticTupleIterator(as_arithmetic_tuple(t)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_inttuple_iter(T0 const& t0, T1 const& t1, Ts const&... ts) { + return make_inttuple_iter(cute::make_tuple(t0, t1, ts...)); +} + +// +// ArithmeticTuple "basis" elements +// A ScaledBasis is a (at least) rank-N+1 ArithmeticTuple: +// (_0,_0,...,T,_0,...) 
+// with value T in the Nth mode + +template +struct ScaledBasis : private tuple +{ + CUTE_HOST_DEVICE constexpr + ScaledBasis(T const& t = {}) : tuple(t) {} + + CUTE_HOST_DEVICE constexpr + decltype(auto) value() { return get<0>(static_cast &>(*this)); } + CUTE_HOST_DEVICE constexpr + decltype(auto) value() const { return get<0>(static_cast const&>(*this)); } + + CUTE_HOST_DEVICE static constexpr + auto mode() { return Int{}; } +}; + +template +struct is_scaled_basis : false_type {}; +template +struct is_scaled_basis> : true_type {}; + +template +struct is_integral> : true_type {}; + +// Get the scalar T out of a ScaledBasis +template +CUTE_HOST_DEVICE constexpr auto +basis_value(SB const& e) +{ + if constexpr (is_scaled_basis::value) { + return basis_value(e.value()); + } else { + return e; + } + CUTE_GCC_UNREACHABLE; +} + +// Apply the N... pack to another Tuple +template +CUTE_HOST_DEVICE decltype(auto) +basis_get(SB const& e, Tuple&& t) +{ + if constexpr (is_scaled_basis::value) { + return basis_get(e.value(), get(static_cast(t))); + } else { + return static_cast(t); + } + CUTE_GCC_UNREACHABLE; +} + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +to_atuple_i(T const& t, seq) { + return make_arithmetic_tuple((void(I),Int<0>{})..., t); +} + +} // end namespace detail + +// Turn a ScaledBases into a rank-N+1 ArithmeticTuple +// with N prefix 0s: (_0,_0,...N...,_0,T) +template +CUTE_HOST_DEVICE constexpr +auto +as_arithmetic_tuple(ScaledBasis const& t) { + return detail::to_atuple_i(as_arithmetic_tuple(t.value()), make_seq{}); +} + +namespace detail { + +template +struct Basis; + +template <> +struct Basis<> { + using type = Int<1>; +}; + +template +struct Basis { + using type = ScaledBasis::type, N>; +}; + +} // end namespace detail + +// Shortcut for writing ScaledBasis, N0>, N1>, ...> +// E<> := _1 +// E<0> := (_1,_0,_0,...) +// E<1> := (_0,_1,_0,...) +// E<0,0> := ((_1,_0,_0,...),_0,_0,...) +// E<0,1> := ((_0,_1,_0,...),_0,_0,...) +// E<1,0> := (_0,(_1,_0,_0,...),_0,...) +// E<1,1> := (_0,(_0,_1,_0,...),_0,...) +template +using E = typename detail::Basis::type; + +template +CUTE_HOST_DEVICE constexpr +auto +make_basis_like(Shape const& shape) +{ + if constexpr (is_integral::value) { + return Int<1>{}; + } else { + // Generate bases for each rank of shape + return transform(tuple_seq{}, shape, [](auto I, auto si) { + // Generate bases for each rank of si and add an i on front + using I_type = decltype(I); + return transform_leaf(make_basis_like(si), [](auto e) { + // MSVC has trouble capturing variables as constexpr, + // so that they can be used as template arguments. + // This is exactly what the code needs to do with i, unfortunately. + // The work-around is to define i inside the inner lambda, + // by using just the type from the enclosing scope. 
+ constexpr int i = I_type::value; + return ScaledBasis{}; + }); + }); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Arithmetic +// + +template +CUTE_HOST_DEVICE constexpr +auto +safe_div(ScaledBasis const& b, U const& u) +{ + auto t = safe_div(b.value(), u); + return ScaledBasis{t}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +shape_div(ScaledBasis const& b, U const& u) +{ + auto t = shape_div(b.value(), u); + return ScaledBasis{t}; +} + +// Equality +template +CUTE_HOST_DEVICE constexpr +auto +operator==(ScaledBasis const& t, ScaledBasis const& u) { + return bool_constant{} && t.value() == u.value(); +} + +// Not equal to anything else +template +CUTE_HOST_DEVICE constexpr +false_type +operator==(ScaledBasis const&, U const&) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +false_type +operator==(T const&, ScaledBasis const&) { + return {}; +} + +// Abs +template +CUTE_HOST_DEVICE constexpr +auto +abs(ScaledBasis const& e) { + return ScaledBasis{abs(e.value())}; +} + +// Multiplication +template +CUTE_HOST_DEVICE constexpr +auto +operator*(A const& a, ScaledBasis const& e) { + auto r = a * e.value(); + return ScaledBasis{r}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator*(ScaledBasis const& e, B const& b) { + auto r = e.value() * b; + return ScaledBasis{r}; +} + +// Addition +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ScaledBasis const& t, ScaledBasis const& u) { + return as_arithmetic_tuple(t) + as_arithmetic_tuple(u); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ScaledBasis const& t, ArithmeticTuple const& u) { + return as_arithmetic_tuple(t) + u; +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ArithmeticTuple const& t, ScaledBasis const& u) { + return t + as_arithmetic_tuple(u); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(C, ScaledBasis const& u) { + static_assert(t == 0, "ScaledBasis op+ error!"); + return u; +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ScaledBasis const& t, C) { + static_assert(u == 0, "ScaledBasis op+ error!"); + return t; +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(ArithmeticTupleIterator const& iter) +{ + printf("ArithTuple"); print(iter.coord_); +} + +template +CUTE_HOST_DEVICE void print(ScaledBasis const& e) +{ + print(e.value()); printf("@%d", N); +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, ArithmeticTupleIterator const& iter) +{ + return os << "ArithTuple" << iter.coord_; +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, ScaledBasis const& e) +{ + return os << e.value() << "@" << N; +} +#endif + +} // end namespace cute + + +namespace CUTE_STL_NAMESPACE +{ + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> + : CUTE_STL_NAMESPACE::tuple_element> +{}; + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> + : CUTE_STL_NAMESPACE::tuple_element> +{}; + +} // end namespace CUTE_STL_NAMESPACE + +#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD +namespace std +{ + +#if defined(__CUDACC_RTC__) +template +struct tuple_size; + +template +struct tuple_element; +#endif + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> + : CUTE_STL_NAMESPACE::tuple_element> +{}; + +template +struct tuple_size> + : CUTE_STL_NAMESPACE::integral_constant +{}; + +template +struct tuple_element> + : 
CUTE_STL_NAMESPACE::tuple_element> +{}; + +} // end namespace std +#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD diff --git a/include/cute/numeric/complex.hpp b/include/cute/numeric/complex.hpp new file mode 100644 index 0000000000..7dd9ea5bf0 --- /dev/null +++ b/include/cute/numeric/complex.hpp @@ -0,0 +1,76 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE + +#include // cutlass::complexm, cutlass::real, cutlass::imag, cutlass::is_complex + +namespace cute +{ + +using cutlass::complex; +using cutlass::is_complex; +using cutlass::RealType; +using cutlass::real; +using cutlass::imag; +using cutlass::conj; + +template +static constexpr auto is_complex_v = is_complex::value; + +/// Fused multiply-add for complex numbers +template +CUTE_HOST_DEVICE constexpr +void +fma(complex & d, + complex const& a, + complex const& b, + complex const& c) +{ + fma(d.real(), a.real(), b.real(), c.real()); + fma(d.imag(), a.real(), b.imag(), c.imag()); + fma(d.real(), -a.imag(), b.imag(), d.real()); + fma(d.imag(), a.imag(), b.real(), d.imag()); +} + +/// Fused multiply-add for triplets +template +CUTE_HOST_DEVICE constexpr +void +fma(complex const& a, + complex const& b, + complex & c) +{ + return fma(c, a, b, c); +} + +} // end namespace cute diff --git a/include/cute/numeric/int.hpp b/include/cute/numeric/int.hpp new file mode 100644 index 0000000000..571b3e3ed0 --- /dev/null +++ b/include/cute/numeric/int.hpp @@ -0,0 +1,106 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include // CUTE_STL_NAMESPACE + +#include // cutlass::int2b_t, cutlass::int4b_t + +namespace cute +{ + +// +// Signed integers +// + +using int2_t = cutlass::int2b_t; +using int4_t = cutlass::int4b_t; +using CUTE_STL_NAMESPACE::int8_t; +using CUTE_STL_NAMESPACE::int16_t; +using CUTE_STL_NAMESPACE::int32_t; +using CUTE_STL_NAMESPACE::int64_t; + +template struct int_bit; +template <> struct int_bit< 2> { using type = int2_t; }; +template <> struct int_bit< 4> { using type = int4_t; }; +template <> struct int_bit< 8> { using type = int8_t; }; +template <> struct int_bit< 16> { using type = int16_t; }; +template <> struct int_bit< 32> { using type = int32_t; }; +template <> struct int_bit< 64> { using type = int64_t; }; + +template +using int_bit_t = typename int_bit::type; + +template +using int_byte = int_bit<8*N>; + +template +using int_byte_t = typename int_byte::type; + +// +// Unsigned integers +// + +using uint1_t = cutlass::uint1b_t; +using uint2_t = cutlass::uint2b_t; +using uint4_t = cutlass::uint4b_t; +using CUTE_STL_NAMESPACE::uint8_t; +using CUTE_STL_NAMESPACE::uint16_t; +using CUTE_STL_NAMESPACE::uint32_t; +using CUTE_STL_NAMESPACE::uint64_t; +using cutlass::uint128_t; + +template struct uint_bit; +template <> struct uint_bit< 1> { using type = uint1_t; }; +template <> struct uint_bit< 2> { using type = uint2_t; }; +template <> struct uint_bit< 4> { using type = uint4_t; }; +template <> struct uint_bit< 8> { using type = uint8_t; }; +template <> struct uint_bit< 16> { using type = uint16_t; }; +template <> struct uint_bit< 32> { using type = uint32_t; }; +template <> struct uint_bit< 64> { using type = uint64_t; }; +template <> struct uint_bit<128> { using type = cutlass::uint128_t; }; + +template +using uint_bit_t = typename uint_bit::type; + +template 
+using uint_byte = uint_bit<8*N>; + +template +using uint_byte_t = typename uint_byte::type; + +} // namespace cute diff --git a/include/cute/numeric/integer_sequence.hpp b/include/cute/numeric/integer_sequence.hpp new file mode 100644 index 0000000000..6080179585 --- /dev/null +++ b/include/cute/numeric/integer_sequence.hpp @@ -0,0 +1,151 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include +#include + +namespace cute +{ + +using CUTE_STL_NAMESPACE::integer_sequence; +using CUTE_STL_NAMESPACE::make_integer_sequence; + +namespace detail { + +template +struct range_impl; + +template +struct range_impl, Begin> { + using type = integer_sequence; +}; + +template +struct reverse_impl; + +template +struct reverse_impl> { + using type = integer_sequence; +}; + +} // end namespace detail + +template +using make_integer_range = typename detail::range_impl< + T, + make_integer_sequence 0) ? 
(End-Begin) : 0>, + Begin>::type; + +template +using make_integer_sequence_reverse = typename detail::reverse_impl< + make_integer_sequence>::type; + +// +// Common aliases +// + +// int_sequence + +template +using int_sequence = integer_sequence; + +template +using make_int_sequence = make_integer_sequence; + +template +using make_int_rsequence = make_integer_sequence_reverse; + +template +using make_int_range = make_integer_range; + +// index_sequence + +template +using index_sequence = integer_sequence; + +template +using make_index_sequence = make_integer_sequence; + +template +using make_index_rsequence = make_integer_sequence_reverse; + +template +using make_index_range = make_integer_range; + +// +// Shortcuts +// + +template +using seq = int_sequence; + +template +using make_seq = make_int_sequence; + +template +using make_rseq = make_int_rsequence; + +template +using make_range = make_int_range; + +template +using tuple_seq = make_seq>::value>; + +template +using tuple_rseq = make_rseq>::value>; + +// +// Specialize cute::tuple-traits for std::integer_sequence +// + +template +struct tuple_size> + : cute::integral_constant +{}; + +template +struct tuple_element> +{ + constexpr static T idx[sizeof...(Is)] = {Is...}; + using type = cute::integral_constant; +}; + +template +CUTE_HOST_DEVICE constexpr +tuple_element_t> +get(integer_sequence) { + static_assert(I < sizeof...(Ints), "Index out of range"); + return {}; +} + +} // end namespace cute diff --git a/include/cute/numeric/integral_constant.hpp b/include/cute/numeric/integral_constant.hpp new file mode 100644 index 0000000000..3a8d036eef --- /dev/null +++ b/include/cute/numeric/integral_constant.hpp @@ -0,0 +1,517 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include // cute::max, etc +#include // cute::print +#include // __CUTE_REQUIRES, cute::is_std_integral + +namespace cute +{ + +// A constant value: short name and type-deduction for fast compilation +template +struct C { + using type = C; + static constexpr auto value = v; + using value_type = decltype(v); + CUTE_HOST_DEVICE constexpr operator value_type() const noexcept { return value; } + CUTE_HOST_DEVICE constexpr value_type operator()() const noexcept { return value; } +}; + +// Deprecate +template +using constant = C; + +template +using bool_constant = C; + +using true_type = bool_constant; +using false_type = bool_constant; + +// A more std:: conforming integral_constant that enforces type but interops with C +template +struct integral_constant : C { + using type = integral_constant; + static constexpr T value = v; + using value_type = T; + // Disambiguate C::operator value_type() + //CUTE_HOST_DEVICE constexpr operator value_type() const noexcept { return value; } + CUTE_HOST_DEVICE constexpr value_type operator()() const noexcept { return value; } +}; + +// +// Traits +// + +// Use cute::is_std_integral to match built-in integral types (int, int64_t, unsigned, etc) +// Use cute::is_integral to match both built-in integral types AND static integral types. + +template +struct is_integral : bool_constant::value> {}; +template +struct is_integral > : true_type {}; +template +struct is_integral> : true_type {}; + +// Register FastDivmod as the integral type +template<> +struct is_integral : true_type {}; + +// is_static detects if an (abstract) value is defined completely by its type (no members) +template +struct is_static : bool_constant>::value> {}; + +template +constexpr bool is_static_v = is_static::value; + +// is_constant detects if a type is a static integral type and if v is equal to a value + +template +struct is_constant : false_type {}; +template +struct is_constant : is_constant {}; +template +struct is_constant : is_constant {}; +template +struct is_constant : is_constant {}; +template +struct is_constant : is_constant {}; +template +struct is_constant > : bool_constant {}; +template +struct is_constant> : bool_constant {}; + +// +// Specializations +// + +template +using Int = C; + +using _m32 = Int<-32>; +using _m24 = Int<-24>; +using _m16 = Int<-16>; +using _m12 = Int<-12>; +using _m10 = Int<-10>; +using _m9 = Int<-9>; +using _m8 = Int<-8>; +using _m7 = Int<-7>; +using _m6 = Int<-6>; +using _m5 = Int<-5>; +using _m4 = Int<-4>; +using _m3 = Int<-3>; +using _m2 = Int<-2>; +using _m1 = Int<-1>; +using _0 = Int<0>; +using _1 = Int<1>; +using _2 = Int<2>; +using _3 = Int<3>; +using _4 = Int<4>; +using _5 = Int<5>; +using _6 = Int<6>; +using _7 = Int<7>; +using _8 = Int<8>; +using _9 = Int<9>; +using _10 = Int<10>; +using _12 = Int<12>; +using _16 = Int<16>; +using _24 = Int<24>; +using _32 = Int<32>; +using _40 = Int<40>; +using _48 = Int<48>; +using _56 = Int<56>; +using _64 = Int<64>; +using _72 = Int<72>; +using _80 = Int<80>; +using _88 = Int<88>; +using _96 = Int<96>; +using _104 = Int<104>; +using _112 = Int<112>; +using _120 = Int<120>; +using _128 = Int<128>; +using _136 = Int<136>; +using _144 = Int<144>; +using _152 = Int<152>; +using _160 = Int<160>; +using _168 = Int<168>; +using _176 = Int<176>; +using _184 = Int<184>; +using _192 = Int<192>; +using _200 = Int<200>; +using _208 = Int<208>; +using _216 = Int<216>; +using _224 = Int<224>; 
+using _232 = Int<232>; +using _240 = Int<240>; +using _248 = Int<248>; +using _256 = Int<256>; +using _384 = Int<384>; +using _512 = Int<512>; +using _768 = Int<768>; +using _1024 = Int<1024>; +using _2048 = Int<2048>; +using _4096 = Int<4096>; +using _8192 = Int<8192>; +using _16384 = Int<16384>; +using _32768 = Int<32768>; +using _65536 = Int<65536>; +using _131072 = Int<131072>; +using _262144 = Int<262144>; +using _524288 = Int<524288>; + +/***************/ +/** Operators **/ +/***************/ + +#define CUTE_LEFT_UNARY_OP(OP) \ + template \ + CUTE_HOST_DEVICE constexpr \ + C<(OP t)> operator OP (C) { \ + return {}; \ + } +#define CUTE_RIGHT_UNARY_OP(OP) \ + template \ + CUTE_HOST_DEVICE constexpr \ + C<(t OP)> operator OP (C) { \ + return {}; \ + } +#define CUTE_BINARY_OP(OP) \ + template \ + CUTE_HOST_DEVICE constexpr \ + C<(t OP u)> operator OP (C, C) { \ + return {}; \ + } + +CUTE_LEFT_UNARY_OP(+); +CUTE_LEFT_UNARY_OP(-); +CUTE_LEFT_UNARY_OP(~); +CUTE_LEFT_UNARY_OP(!); +CUTE_LEFT_UNARY_OP(*); + +CUTE_BINARY_OP( +); +CUTE_BINARY_OP( -); +CUTE_BINARY_OP( *); +CUTE_BINARY_OP( /); +CUTE_BINARY_OP( %); +CUTE_BINARY_OP( &); +CUTE_BINARY_OP( |); +CUTE_BINARY_OP( ^); +CUTE_BINARY_OP(<<); +CUTE_BINARY_OP(>>); + +CUTE_BINARY_OP(&&); +CUTE_BINARY_OP(||); + +CUTE_BINARY_OP(==); +CUTE_BINARY_OP(!=); +CUTE_BINARY_OP( >); +CUTE_BINARY_OP( <); +CUTE_BINARY_OP(>=); +CUTE_BINARY_OP(<=); + +#undef CUTE_BINARY_OP +#undef CUTE_LEFT_UNARY_OP +#undef CUTE_RIGHT_UNARY_OP + +// +// Mixed static-dynamic special cases +// + +template ::value && t == 0)> +CUTE_HOST_DEVICE constexpr +C<0> +operator*(C, U) { + return {}; +} + +template ::value && t == 0)> +CUTE_HOST_DEVICE constexpr +C<0> +operator*(U, C) { + return {}; +} + +template ::value && t == 0)> +CUTE_HOST_DEVICE constexpr +C<0> +operator/(C, U) { + return {}; +} + +template ::value && (t == 1 || t == -1))> +CUTE_HOST_DEVICE constexpr +C<0> +operator%(U, C) { + return {}; +} + +template ::value && t == 0)> +CUTE_HOST_DEVICE constexpr +C<0> +operator%(C, U) { + return {}; +} + +template ::value && t == 0)> +CUTE_HOST_DEVICE constexpr +C<0> +operator&(C, U) { + return {}; +} + +template ::value && t == 0)> +CUTE_HOST_DEVICE constexpr +C<0> +operator&(U, C) { + return {}; +} + +template ::value && !bool(t))> +CUTE_HOST_DEVICE constexpr +C +operator&&(C, U) { + return {}; +} + +template ::value && !bool(t))> +CUTE_HOST_DEVICE constexpr +C +operator&&(U, C) { + return {}; +} + +template ::value && bool(t))> +CUTE_HOST_DEVICE constexpr +C +operator||(C, U) { + return {}; +} + +template ::value && bool(t))> +CUTE_HOST_DEVICE constexpr +C +operator||(U, C) { + return {}; +} + +// +// Named functions from math.hpp +// + +#define CUTE_NAMED_UNARY_FN(OP) \ + template \ + CUTE_HOST_DEVICE constexpr \ + C OP (C) { \ + return {}; \ + } +#define CUTE_NAMED_BINARY_FN(OP) \ + template \ + CUTE_HOST_DEVICE constexpr \ + C OP (C, C) { \ + return {}; \ + } \ + template ::value)> \ + CUTE_HOST_DEVICE constexpr \ + auto OP (C, U u) { \ + return OP(t,u); \ + } \ + template ::value)> \ + CUTE_HOST_DEVICE constexpr \ + auto OP (T t, C) { \ + return OP(t,u); \ + } + +CUTE_NAMED_UNARY_FN(abs); +CUTE_NAMED_UNARY_FN(signum); +CUTE_NAMED_UNARY_FN(has_single_bit); + +CUTE_NAMED_BINARY_FN(max); +CUTE_NAMED_BINARY_FN(min); +CUTE_NAMED_BINARY_FN(shiftl); +CUTE_NAMED_BINARY_FN(shiftr); +CUTE_NAMED_BINARY_FN(gcd); +CUTE_NAMED_BINARY_FN(lcm); + +#undef CUTE_NAMED_UNARY_FN +#undef CUTE_NAMED_BINARY_FN + +// +// Other functions +// + +template +CUTE_HOST_DEVICE constexpr +C +safe_div(C, 
C) { + static_assert(t % u == 0, "Static safe_div requires t % u == 0"); + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +safe_div(C, U u) { + return t / u; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +safe_div(T t, C) { + return t / u; +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +conditional_return(true_type, TrueType&& t, FalseType&&) { + return static_cast(t); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +conditional_return(false_type, TrueType&&, FalseType&& f) { + return static_cast(f); +} + +template +CUTE_HOST_DEVICE constexpr +auto +conditional_return(bool b, C const&, C const&) { + return C{}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +conditional_return(bool b, C const&, C const&) { + return b ? v : u; +} + +// TrueType and FalseType must have a common type +template +CUTE_HOST_DEVICE constexpr +auto +conditional_return(bool b, TrueType const& t, FalseType const& f) { + return b ? t : f; +} + +// TrueType and FalseType don't require a common type +template +CUTE_HOST_DEVICE constexpr +auto +conditional_return(TrueType const& t, FalseType const& f) { + if constexpr (b) { + return t; + } else { + return f; + } +} + +template +CUTE_HOST_DEVICE constexpr +auto +static_value() +{ + if constexpr (is_std_integral::value) { + return Int{}; + } else { + return Trait::value; + } + CUTE_GCC_UNREACHABLE; +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(C) { + printf("_"); + ::cute::print(Value); +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, C const&) { + return os << "_" << t; +} +#endif + + +namespace detail { + +// parse_int_digits takes a variadic number of digits and converts them into an int +template +constexpr uint64_t parse_int_digits(uint64_t result, int digit, Ts... digits) +{ + if constexpr (sizeof...(Ts) == 0) { + return 10 * result + digit; + } else { + return parse_int_digits(10 * result + digit, digits...); + } +} + +} // end namespace detail + + +// This user-defined literal operator allows cute::constant written as literals. For example, +// +// auto var = 32_c; +// +// var has type cute::constant. +// +template +constexpr cute::constant operator "" _c() +{ + static_assert((('0' <= digits && digits <= '9') && ...), + "Expected 0 <= digit <= 9 for each digit of the integer."); + return {}; +} + +} // end namespace cute diff --git a/include/cute/numeric/integral_ratio.hpp b/include/cute/numeric/integral_ratio.hpp new file mode 100644 index 0000000000..a614bdb2d9 --- /dev/null +++ b/include/cute/numeric/integral_ratio.hpp @@ -0,0 +1,293 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::false_type, cute::true_type +#include // cute::signum +#include // __CUTE_REQUIRES + +namespace cute +{ + +/** Compile-time rational arithmetic type. + * Like cute::C for std::integral_constant, cute::R for std::ratio has a short name + * for error messages and compile times. + * The static data members @a num and @a den represent the reduced numerator and denominator + * of the rational value. Thus, two cute::R types with different @a n or @a d are distinct types + * even if they represent the same rational value. + * A cute::R exposes the reduced canonical type via its ::type member. + * That is, cute::R<3,6>::type is cute::R<1,2> and cute::R<6,3>::type is cute::C<2>. + * A cute::R::value can be used much like any other trait::value. It can be involved in + * arithmetic expressions (according to the operator-overloads for cute::C and cute::R, + * though these may be incomplete) but with a potential rational value rather than an integral value. 
+ */ +template +class R { + static_assert(d != 0); + static constexpr auto an = abs(n); + static constexpr auto ad = abs(d); + static constexpr auto g = gcd(an, ad); + + public: + static constexpr auto num = signum(n) * signum(d) * an / g; + static constexpr auto den = ad / g; + // RI: den >= 1 && gcd(abs(num),den) == 1 + using type = typename conditional, R>::type; +}; + +template +struct is_ratio : false_type {}; +template +struct is_ratio> : true_type {}; + +template +CUTE_HOST_DEVICE constexpr +typename R::type +ratio(C, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +ratio(C, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +ratio(R, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +ratio(R, R) { + return {}; +} + +// +// Non-reduced ratio implementations +// + +template +CUTE_HOST_DEVICE constexpr +R +nratio(C, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +R +nratio(C, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +R +nratio(R, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +R +nratio(R, R) { + return {}; +} + +// +// Operators +// + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator*(R, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator*(R, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator*(C, R) { + return {}; +} + +// Product with dynamic type needs to produce an integer... +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +operator*(C const& c, R) { + return c * R::num / R::den; +} + +// Product with dynamic type needs to produce an integer... +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +operator*(R, C const& c) { + return c * R::num / R::den; +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator/(C const& c, R) { + return c * R{}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator+(R, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator+(R, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator+(C, R) { + return {}; +} + +///////////////// +// Comparisons // +///////////////// + +template +CUTE_HOST_DEVICE constexpr +bool_constant::num == R::num && R::den == R::den> +operator==(R, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +bool_constant::num == c && R::den == 1> +operator==(R, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +bool_constant::num == c && R::den == 1> +operator==(C, R) { + return {}; +} + +/////////////////////// +// Special functions // +/////////////////////// + +template +CUTE_HOST_DEVICE constexpr +typename R::type +gcd(R, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +gcd(R, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +gcd(C, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +abs(R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +int32_t +log_2(R) { + static_assert(R::num > 0); + static_assert(R::den > 0); + return log_2(static_cast(R::num)) - log_2(static_cast(R::den)); +} + +// @return A non-reduced ratio cute::R of the Trait0::value / Trait1::value +template +CUTE_HOST_DEVICE constexpr +auto +trait_ratio(Trait0, Trait1) { + return nratio(static_value(), static_value()); +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(R) { + print(C{}); print("/"); print(C{}); +} 
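The `cute::R` ratio type above keeps a reduced `num`/`den` pair and only collapses to an integral `C` when the denominator reaches 1; `trait_ratio` deliberately returns the non-reduced form. A small sketch of those rules, assuming the surrounding CuTe headers; the `halve` helper and the `uint8_t`/`uint4_t` pairing are illustrative choices, not taken from this change:

```cpp
#include <cute/numeric/integral_ratio.hpp>   // cute::R, ratio, nratio, trait_ratio
#include <cute/numeric/numeric_types.hpp>    // cute::sizeof_bits, cute::uint4_t

using namespace cute;

// R<n,d> stores the reduced numerator/denominator and canonicalizes through ::type.
static_assert(R<3,6>::num == 1 && R<3,6>::den == 2);   // R<3,6>::type is R<1,2>
static_assert(R<6,3>::num == 2 && R<6,3>::den == 1);   // R<6,3>::type is C<2>

// ratio() of two static integers yields the reduced result as a value.
static_assert(decltype(ratio(Int<2>{}, Int<8>{}))::num == 1 &&
              decltype(ratio(Int<2>{}, Int<8>{}))::den == 4);

// Mixing with a dynamic integer falls back to integer arithmetic (c * num / den).
constexpr int halve(int n) { return n * ratio(Int<1>{}, Int<2>{}); }
static_assert(halve(10) == 5);

// trait_ratio compares two traits' values as a non-reduced R; this is the kind of
// scale that recast_layout inspects when deciding between upcast and downcast.
using scale = decltype(trait_ratio(sizeof_bits<uint8_t>{}, sizeof_bits<uint4_t>{}));
static_assert(scale::num == 2 && scale::den == 1);
```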
+ +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, R) { + return os << "_" << C{} << "/" << C{}; +} +#endif + +} // end namespace cute diff --git a/include/cute/numeric/math.hpp b/include/cute/numeric/math.hpp new file mode 100644 index 0000000000..e493a3a953 --- /dev/null +++ b/include/cute/numeric/math.hpp @@ -0,0 +1,356 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // __CUTE_REQUIRES + +#include + +namespace cute +{ + +// +// Common Operations +// + +template ::value && + is_arithmetic::value)> +CUTE_HOST_DEVICE constexpr +auto +max(T const& t, U const& u) { + return t < u ? u : t; +} + +template ::value && + is_arithmetic::value)> +CUTE_HOST_DEVICE constexpr +auto +min(T const& t, U const& u) { + return t < u ? t : u; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +abs(T const& t) { + if constexpr (is_signed::value) { + return t < T(0) ? -t : t; + } else { + return t; + } + + CUTE_GCC_UNREACHABLE; +} + +// Returns 1 if x > 0, -1 if x < 0, and 0 if x is zero. 
+template ::value)> +CUTE_HOST_DEVICE constexpr +int +signum(T const& x) { + if constexpr (is_signed::value) { + return (T(0) < x) - (x < T(0)); + } else { + return T(0) < x; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// C++17 operations +// + +// Greatest common divisor of two positive integers +template ::value && + is_std_integral::value)> +CUTE_HOST_DEVICE constexpr +cute::common_type_t +gcd(T t, U u) { + while (true) { + if (t == 0) { return u; } + u %= t; + if (u == 0) { return t; } + t %= u; + } +} + +// Least common multiple of two positive integers +template ::value && + is_std_integral::value)> +CUTE_HOST_DEVICE constexpr +cute::common_type_t +lcm(T const& t, U const& u) { + return (t / gcd(t,u)) * u; +} + +// +// C++20 operations +// + +// Checks if a number is an integral power of two +template +CUTE_HOST_DEVICE constexpr +bool +has_single_bit(T x) { + return x != 0 && (x & (x - 1)) == 0; +} + +// Smallest number of bits needed to represent the given value +// For x == 0, this is 0 +// For x != 0, this is 1 + floor(log2(x)) +// bit_width( 0b0000 ) = 0 +// bit_width( 0b0001 ) = 1 +// bit_width( 0b0010 ) = 2 +// bit_width( 0b0011 ) = 2 +// bit_width( 0b0100 ) = 3 +// bit_width( 0b0101 ) = 3 +// bit_width( 0b0110 ) = 3 +// bit_width( 0b0111 ) = 3 +template +CUTE_HOST_DEVICE constexpr +int +bit_width(T x) { + static_assert(is_unsigned::value, "Only to be used for unsigned types."); + constexpr int N = (numeric_limits::digits == 64 ? 6 : + (numeric_limits::digits == 32 ? 5 : + (numeric_limits::digits == 16 ? 4 : + (numeric_limits::digits == 8 ? 3 : (assert(false),0))))); + T r = 0; + for (int i = N - 1; i >= 0; --i) { + T shift = (x > ((T(1) << (T(1) << i))-1)) << i; + x >>= shift; + r |= shift; + } + return r + (x != 0); +} + +// Smallest integral power of two not less than the given value +// bit_ceil( 0b00000000 ) = 0b00000001 +// bit_ceil( 0b00000001 ) = 0b00000001 +// bit_ceil( 0b00000010 ) = 0b00000010 +// bit_ceil( 0b00000011 ) = 0b00000100 +// bit_ceil( 0b00000100 ) = 0b00000100 +// bit_ceil( 0b00000101 ) = 0b00001000 +// bit_ceil( 0b00000110 ) = 0b00001000 +// bit_ceil( 0b00000111 ) = 0b00001000 +// bit_ceil( 0b00001000 ) = 0b00001000 +// bit_ceil( 0b00001001 ) = 0b00010000 +template +CUTE_HOST_DEVICE constexpr +T +bit_ceil(T x) { + return x == 0 ? T(1) : (T(1) << bit_width(x - 1)); +} + +// Largest integral power of two not greater than the given value +// bit_floor( 0b00000000 ) = 0b00000000 +// bit_floor( 0b00000001 ) = 0b00000001 +// bit_floor( 0b00000010 ) = 0b00000010 +// bit_floor( 0b00000011 ) = 0b00000010 +// bit_floor( 0b00000100 ) = 0b00000100 +// bit_floor( 0b00000101 ) = 0b00000100 +// bit_floor( 0b00000110 ) = 0b00000100 +// bit_floor( 0b00000111 ) = 0b00000100 +// bit_floor( 0b00001000 ) = 0b00001000 +// bit_floor( 0b00001001 ) = 0b00001000 +template +CUTE_HOST_DEVICE constexpr +T +bit_floor(T x) { + return x == 0 ? 0 : (T(1) << (bit_width(x) - 1)); +} + +template +CUTE_HOST_DEVICE constexpr T rotl(T x, int s); +template +CUTE_HOST_DEVICE constexpr T rotr(T x, int s); + +// Computes the result of circular bitwise left-rotation +template +CUTE_HOST_DEVICE constexpr +T +rotl(T x, int s) { + constexpr int N = numeric_limits::digits; + return static_cast(s == 0 ? x : s > 0 ? (x << s) | (x >> (N - s)) : rotr(x, -s)); +} + +// Computes the result of circular bitwise right-rotation +template +CUTE_HOST_DEVICE constexpr +T +rotr(T x, int s) { + constexpr int N = numeric_limits::digits; + return static_cast(s == 0 ? x : s > 0 ? 
(x >> s) | (x << (N - s)) : rotl(x, -s)); +} + +// Counts the number of consecutive 0 bits, starting from the most significant bit +// countl_zero( 0b00000000 ) = 8 +// countl_zero( 0b11111111 ) = 0 +// countl_zero( 0b00011100 ) = 3 +template +CUTE_HOST_DEVICE constexpr +int +countl_zero(T x) { + return numeric_limits::digits - bit_width(x); +} + +// Counts the number of consecutive 1 bits, starting from the most significant bit +// countl_one( 0b00000000 ) = 0 +// countl_one( 0b11111111 ) = 8 +// countl_one( 0b11100011 ) = 3 +template +CUTE_HOST_DEVICE constexpr +int +countl_one(T x) { + return countl_zero(~x); +} + +// Counts the number of consecutive 0 bits, starting from the least significant bit +// countr_zero( 0b00000000 ) = 8 +// countr_zero( 0b11111111 ) = 0 +// countr_zero( 0b00011100 ) = 2 +template +CUTE_HOST_DEVICE constexpr +int +countr_zero(T x) { + return x == 0 ? numeric_limits::digits : bit_width(T(x & T(-x))) - 1; // bit_width of the LSB +} + +// Counts the number of consecutive 1 bits, starting from the least significant bit +// countr_one( 0b00000000 ) = 0 +// countr_one( 0b11111111 ) = 8 +// countr_one( 0b11100011 ) = 2 +template +CUTE_HOST_DEVICE constexpr +int +countr_one(T x) { + return countr_zero(~x); +} + +// Counts the number of 1 bits in an unsigned integer +// popcount( 0b00000000 ) = 0 +// popcount( 0b11111111 ) = 8 +// popcount( 0b00011101 ) = 4 +template +CUTE_HOST_DEVICE constexpr +int +popcount(T x) { + int c = 0; + while (x) { + ++c; + x &= x - 1; // clear the least significant bit set + } + return c; +} + +// +// Custom operations +// + +// Computes the result of bitwise left-shift +template +CUTE_HOST_DEVICE constexpr +auto +shiftl(T x, int s) { + return s >= 0 ? (x << s) : (x >> -s); +} + +// Computes the result of bitwise right-shift +template +CUTE_HOST_DEVICE constexpr +auto +shiftr(T x, int s) { + return s >= 0 ? (x >> s) : (x << -s); +} + +// Safe divide +// @pre t % u == 0 +// @result t / u +template ::value && + is_std_integral::value)> +CUTE_HOST_DEVICE constexpr +auto +safe_div(T const& t, U const& u) { + //assert(t % u == 0); + return t / u; +} + +/** + * log2 computation + */ + +template +CUTE_HOST_DEVICE constexpr +int32_t +log_2(T x) { + assert(x > 0); + static_assert(is_unsigned::value, "Only to be used for unsigned integral types."); + return static_cast(bit_width(x)) - 1; +} + +template +struct DivModReturnType { + IntDiv div_; + IntMod mod_; + CUTE_HOST_DEVICE constexpr + DivModReturnType(IntDiv const& div, IntMod const& mod) : div_(div), mod_(mod) {} +}; + +// General divmod +template +CUTE_HOST_DEVICE constexpr +auto +divmod(CInt0 const& a, CInt1 const& b) { + return DivModReturnType{a / b, a % b}; +} + +// Specialized function with fastDivmod input +template +CUTE_HOST_DEVICE constexpr +auto +divmod(CInt const& a, cutlass::FastDivmod const& b) { + using val_div_type = typename cutlass::FastDivmod::value_div_type; + using val_mod_type = typename cutlass::FastDivmod::value_mod_type; + val_div_type div = 0; + val_mod_type mod = 0; + b(div, mod, a); + return DivModReturnType{div, mod}; +} + +} // namespace cute diff --git a/include/cute/numeric/numeric_types.hpp b/include/cute/numeric/numeric_types.hpp new file mode 100644 index 0000000000..b9943b8ca3 --- /dev/null +++ b/include/cute/numeric/numeric_types.hpp @@ -0,0 +1,136 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::int2_t, cute::int4_t, etc + +#include // cutlass::sizeof_bits +#include // cutlass::float_e4m3_t, cutlass::float_e5m2_t, etc + +namespace cute { + +template +struct sizeof_bits : public cutlass::sizeof_bits {}; + +// DO NOT change auto to int, sizeof_bits use integral_ratio instead of int +template +static constexpr auto sizeof_bits_v = sizeof_bits::value; + +using cutlass::bits_to_bytes; +using cutlass::bytes_to_bits; + +using cutlass::is_subbyte; + +template +static constexpr auto is_subbyte_v = is_subbyte::value; + +using cutlass::half_t; +using cutlass::bfloat16_t; + +using cutlass::tfloat32_t; + +// Umbrella floating-point 8-bit data type : type_erased_dynamic_float8_t +// This umbrella datatype can be enabled when a user provides a specific +// datatype in runtime argument list. 
+using cutlass::type_erased_dynamic_float8_t; +using cutlass::float_e4m3_t; +using cutlass::float_e5m2_t; + +using cutlass::uint1b_t; +using cutlass::int2b_t; +using cutlass::uint2b_t; +using cutlass::int4b_t; +using cutlass::uint4b_t; +using cutlass::bin1_t; + + +// +// Print utility +// + +CUTE_HOST_DEVICE +void +print(half_t a) { + printf("%f", static_cast(a)); +} + +CUTE_HOST_DEVICE +void +print(bfloat16_t a) { + printf("%f", static_cast(a)); +} + + +CUTE_HOST_DEVICE +void +print(tfloat32_t a) { + printf("%f", static_cast(a)); +} + +CUTE_HOST_DEVICE +void +print(float_e4m3_t a) { + printf("%f", static_cast(a)); +} + +CUTE_HOST_DEVICE +void +print(float_e5m2_t a) { + printf("%f", static_cast(a)); +} + +CUTE_HOST_DEVICE void +pretty_print(bfloat16_t v) { + printf("%*.2f", 8, float(v)); +} + +CUTE_HOST_DEVICE void +pretty_print(half_t v) { + printf("%*.2f", 8, float(v)); +} + +CUTE_HOST_DEVICE void +pretty_print(tfloat32_t v) { + printf("%*.2e", 10, static_cast(v)); +} + +CUTE_HOST_DEVICE void +pretty_print(float_e4m3_t t) { + printf("%*.2f", 8, static_cast(t)); +} + +CUTE_HOST_DEVICE void +pretty_print(float_e5m2_t t) { + printf("%*.2f", 8, static_cast(t)); +} + +} // namespace cute diff --git a/include/cute/numeric/real.hpp b/include/cute/numeric/real.hpp new file mode 100644 index 0000000000..4ce58dfa18 --- /dev/null +++ b/include/cute/numeric/real.hpp @@ -0,0 +1,74 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +namespace cute +{ + +/// Generic add +template +CUTE_HOST_DEVICE constexpr +void +add(C& c, A const& a, B const& b) +{ + c = a + b; +} + +/// Generic multiply +template +CUTE_HOST_DEVICE constexpr +void +mul(C& c, A const& a, B const& b) +{ + c = a * b; +} + +/// Generic fused multiply-add +template +CUTE_HOST_DEVICE constexpr +void +fma(D& d, A const& a, B const& b, C const& c) +{ + d = a * b + c; +} + +/// Fused multiply-add for triplets +template +CUTE_HOST_DEVICE constexpr +void +fma(A const& a, B const& b, C& c) +{ + return fma(c, a, b, c); +} + +} // end namespace cute diff --git a/include/cute/pointer.hpp b/include/cute/pointer.hpp new file mode 100644 index 0000000000..cc49b6a356 --- /dev/null +++ b/include/cute/pointer.hpp @@ -0,0 +1,330 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::iter_adaptor +#include +#include // cute::subbyte_iterator +#include // cute::true_type, cute::false_type +#include // sizeof_bits + +namespace cute +{ + +// +// recast_ptr -- Create an iterator over values of type T. +// For most types this will simply be T*, but certain types require more care. +// Subbyte Types: uint2_t, uint4_t, etc +// Requires construction of a subbyte_iterator in order to properly +// resolve each element in byte-addressed memory. +// Sparse Types: sparse_elem +// A type that holds one physical element meant to represent S number of logical elements. +// Requires construction of a sparse_ptr that emulates access to the S logical elements. 
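A minimal usage sketch of the dispatch described above (header paths are assumptions; `raw` stands in for any untyped buffer):

#include <cute/pointer.hpp>                 // recast_ptr, added by this diff
#include <cute/numeric/numeric_types.hpp>   // assumed path for int4b_t

void recast_sketch(void* raw) {
  auto pf = cute::recast_ptr<float>(raw);           // plain float*
  auto p4 = cute::recast_ptr<cute::int4b_t>(raw);   // subbyte_iterator over 4-bit elements
  // Indexing p4 resolves the correct 4-bit element inside byte-addressed memory.
  (void) pf; (void) p4;
}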
+// + +template +CUTE_HOST_DEVICE constexpr +auto +recast_ptr(void* ptr) +{ + if constexpr (is_sparse::value) { + constexpr int sparsity = NewT::sparsity; + NewT* p = reinterpret_cast(ptr); + return make_sparse_ptr(p); + } else + if constexpr (cute::is_subbyte_v) { + return subbyte_iterator(ptr); + } else { + return reinterpret_cast(ptr); + } + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast_ptr(void const* ptr) +{ + if constexpr (is_sparse::value) { + constexpr int sparsity = NewT::sparsity; + NewT const* p = reinterpret_cast(ptr); + return make_sparse_ptr(p); + } else + if constexpr (cute::is_subbyte_v) { + return subbyte_iterator(ptr); + } else { + return reinterpret_cast(ptr); + } + CUTE_GCC_UNREACHABLE; +} + +// Disambiguate nullptr +template +CUTE_HOST_DEVICE constexpr +auto +recast_ptr(decltype(nullptr)) { // nullptr_t + return recast_ptr(static_cast(nullptr)); +} + +// +// gmem_ptr +// + +template +struct gmem_ptr : iter_adaptor> { + using iter_adaptor>::iter_adaptor; +}; + +template +struct is_gmem : false_type {}; +template // Found the gmem +struct is_gmem> : true_type {}; +template // Recurse on ::iterator, if possible +struct is_gmem> : is_gmem {}; +template +constexpr bool is_gmem_v = is_gmem
<P>
::value; + +// Idempotent gmem tag on an iterator +template +CUTE_HOST_DEVICE constexpr +auto +make_gmem_ptr(Iterator iter) { + if constexpr (is_gmem::value) { + return iter; + } else { + return gmem_ptr{iter}; + } + CUTE_GCC_UNREACHABLE; +} + +// Explicitly typed construction from a raw pointer +template +CUTE_HOST_DEVICE constexpr +auto +make_gmem_ptr(void* ptr) { + return make_gmem_ptr(recast_ptr(ptr)); +} + +// Explicitly typed construction from a raw pointer +template +CUTE_HOST_DEVICE constexpr +auto +make_gmem_ptr(void const* ptr) { + return make_gmem_ptr(recast_ptr(ptr)); +} + +// nullptr_t overload for make_gmem_ptr(nullptr) disambiguation +template +CUTE_HOST_DEVICE constexpr +auto +make_gmem_ptr(decltype(nullptr)) { // nullptr_t + return make_gmem_ptr(recast_ptr(nullptr)); +} + +// The gmem tag is invariant over type-recast +template +CUTE_HOST_DEVICE constexpr +auto +recast_ptr(gmem_ptr
<P>
const& ptr) { + return make_gmem_ptr(recast_ptr(ptr.get())); +} + +// +// smem_ptr +// + +template +struct smem_ptr : iter_adaptor> { + using iter_adaptor>::iter_adaptor; +}; + +template +struct is_smem : false_type {}; +template // Found the smem +struct is_smem> : true_type {}; +template // Recurse on ::iterator, if possible +struct is_smem> : is_smem {}; +template +constexpr bool is_smem_v = is_smem
<P>
::value; + +// Idempotent smem tag on an iterator +template +CUTE_HOST_DEVICE constexpr +auto +make_smem_ptr(Iterator iter) { + if constexpr (is_smem::value) { + return iter; + } else { + return smem_ptr{iter}; + } + CUTE_GCC_UNREACHABLE; +} + +// Make a smem swizzle pointer, common operation +template +CUTE_HOST_DEVICE constexpr +auto +make_smem_ptr(Iterator ptr, Swizzle sw) +{ + return make_swizzle_ptr(make_smem_ptr(ptr), sw); +} + +// Explicitly typed construction from a raw pointer +template +CUTE_HOST_DEVICE constexpr +auto +make_smem_ptr(void* ptr) { + return make_smem_ptr(recast_ptr(ptr)); +} + +// Explicitly typed construction from a raw pointer +template +CUTE_HOST_DEVICE constexpr +auto +make_smem_ptr(void const* ptr) { + return make_smem_ptr(recast_ptr(ptr)); +} + +// nullptr_t overload for make_smem_ptr(nullptr) disambiguation +template +CUTE_HOST_DEVICE constexpr +auto +make_smem_ptr(decltype(nullptr)) { // nullptr_t + return make_smem_ptr(recast_ptr(nullptr)); +} + +// The smem tag is invariant over type-recast +template +CUTE_HOST_DEVICE constexpr +auto +recast_ptr(smem_ptr
<P>
const& ptr) { + return make_smem_ptr(recast_ptr(ptr.get())); +} + +// +// rmem_ptr +// + +template +struct rmem_ptr : iter_adaptor> { + using iter_adaptor>::iter_adaptor; +}; + +// Anything that is not gmem or smem is rmem +template +struct is_rmem : bool_constant::value || is_smem::value)> {}; +template +struct is_rmem> : true_type {}; +template +constexpr bool is_rmem_v = is_rmem
<P>
::value; + +// Idempotent rmem tag on an iterator +template +CUTE_HOST_DEVICE constexpr +auto +make_rmem_ptr(Iterator iter) { + if constexpr (is_rmem::value) { + return iter; + } else { + return rmem_ptr{iter}; + } + CUTE_GCC_UNREACHABLE; +} + +// Explicitly typed construction from a raw pointer +template +CUTE_HOST_DEVICE constexpr +auto +make_rmem_ptr(void* ptr) { + return make_rmem_ptr(recast_ptr(ptr)); +} + +// Explicitly typed construction from a raw pointer +template +CUTE_HOST_DEVICE constexpr +auto +make_rmem_ptr(void const* ptr) { + return make_rmem_ptr(recast_ptr(ptr)); +} + +// The rmem tag is invariant over type-recast +template +CUTE_HOST_DEVICE constexpr +auto +recast_ptr(rmem_ptr
<P>
const& ptr) { + return make_rmem_ptr(recast_ptr(ptr.get())); +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(gmem_ptr ptr) +{ + printf("gmem_"); print(ptr.get()); +} + +template +CUTE_HOST_DEVICE void print(smem_ptr ptr) +{ + printf("smem_"); print(ptr.get()); +} + +template +CUTE_HOST_DEVICE void print(rmem_ptr ptr) +{ + printf("rmem_"); print(ptr.get()); +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, gmem_ptr ptr) +{ + return os << "gmem_[" << int(sizeof_bits>::value) << "b]"; +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, smem_ptr ptr) +{ + return os << "smem_[" << int(sizeof_bits>::value) << "b]"; +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, rmem_ptr ptr) +{ + return os << "rmem_[" << int(sizeof_bits>::value) << "b]"; +} + +#endif // !defined(__CUDACC_RTC__) + +} // end namespace cute diff --git a/include/cute/pointer_base.hpp b/include/cute/pointer_base.hpp new file mode 100644 index 0000000000..57ad0b3cde --- /dev/null +++ b/include/cute/pointer_base.hpp @@ -0,0 +1,262 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::sizeof_bits +#include // Int<0> +#include // cute::declval, cute::void_t, etc + +namespace cute +{ + +// +// C++20 iterator_traits +// + +namespace detail { +// Default reference type of an iterator +template +struct iter_ref { using type = decltype(*declval()); }; +// Prefer to propagate ::reference +template +struct iter_ref> { using type = typename T::reference; }; +} // end namespace detail + +template +using iter_reference = detail::iter_ref; +template +using iter_reference_t = typename iter_reference::type; + +namespace detail { +// Default element_type of an iterator +template +struct iter_e { using type = remove_reference_t::type>; }; +// Prefer to propagate ::element_type +template +struct iter_e> { using type = typename T::element_type; }; +} // end namespace detail + +template +using iter_element = detail::iter_e; +template +using iter_element_t = typename iter_element::type; + +namespace detail { +// Default value_type of an iterator +template +struct iter_v { using type = remove_cv_t::type>; }; +// Prefer to propagate ::value_type +template +struct iter_v> { using type = typename T::value_type; }; +} // end namespace detail + +template +using iter_value = detail::iter_v; +template +using iter_value_t = typename iter_value::type; + +template +struct iterator_traits { + using reference = iter_reference_t; + using element_type = iter_element_t; + using value_type = iter_value_t; +}; + +// +// has_dereference to determine if a type is an iterator concept +// + +namespace detail { +template +struct has_dereference : CUTE_STL_NAMESPACE::false_type {}; +template +struct has_dereference())>> : CUTE_STL_NAMESPACE::true_type {}; +} // end namespace detail + +template +using has_dereference = detail::has_dereference; + +// +// raw_pointer_cast +// + +template +CUTE_HOST_DEVICE constexpr +T* +raw_pointer_cast(T* ptr) { + return ptr; +} + +// The statically-known alignment of a dynamic pointer is unknown +template +CUTE_HOST_DEVICE constexpr +Int<0> +max_alignment(T*) { + return {}; +} + +// +// A very simplified iterator adaptor. +// Derived classed may override methods, but be careful to reproduce interfaces exactly. +// Clients should never have an instance of this class. Do not write methods that take this as a param. 
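A sketch of the intended usage pattern for the adaptor defined just below; it mirrors how gmem_ptr/smem_ptr above wrap it, and the name my_tag_ptr is hypothetical:

// Hypothetical tag pointer in the same style as gmem_ptr/smem_ptr:
template <class Iterator>
struct my_tag_ptr : cute::iter_adaptor<Iterator, my_tag_ptr<Iterator>> {
  using cute::iter_adaptor<Iterator, my_tag_ptr<Iterator>>::iter_adaptor;
};

// my_tag_ptr<float*> p{buf};   // wraps an existing iterator/pointer
// float x = p[3];              // forwards to buf[3]
// auto  q = p + 4;             // operator+ returns another my_tag_ptr<float*>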
+// + +template +struct iter_adaptor +{ + using iterator = Iterator; + using reference = typename iterator_traits::reference; + using element_type = typename iterator_traits::element_type; + using value_type = typename iterator_traits::value_type; + + iterator ptr_; + + CUTE_HOST_DEVICE constexpr + iter_adaptor(iterator ptr = {}) : ptr_(ptr) {} + + CUTE_HOST_DEVICE constexpr + reference operator*() const { return *ptr_; } + + template + CUTE_HOST_DEVICE constexpr + reference operator[](Index const& i) const { return ptr_[i]; } + + template + CUTE_HOST_DEVICE constexpr + DerivedType operator+(Index const& i) const { return {ptr_ + i}; } + + CUTE_HOST_DEVICE constexpr + iterator get() const { return ptr_; } + + CUTE_HOST_DEVICE constexpr + friend bool operator==(DerivedType const& x, DerivedType const& y) { return x.ptr_ == y.ptr_; } + CUTE_HOST_DEVICE constexpr + friend bool operator!=(DerivedType const& x, DerivedType const& y) { return x.ptr_ != y.ptr_; } + CUTE_HOST_DEVICE constexpr + friend bool operator< (DerivedType const& x, DerivedType const& y) { return x.ptr_ < y.ptr_; } + CUTE_HOST_DEVICE constexpr + friend bool operator<=(DerivedType const& x, DerivedType const& y) { return x.ptr_ <= y.ptr_; } + CUTE_HOST_DEVICE constexpr + friend bool operator> (DerivedType const& x, DerivedType const& y) { return x.ptr_ > y.ptr_; } + CUTE_HOST_DEVICE constexpr + friend bool operator>=(DerivedType const& x, DerivedType const& y) { return x.ptr_ >= y.ptr_; } +}; + +template +CUTE_HOST_DEVICE constexpr +auto +raw_pointer_cast(iter_adaptor const& x) { + return raw_pointer_cast(x.ptr_); +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_alignment(iter_adaptor const& x) { + return max_alignment(x.ptr_); +} + +// +// counting iterator -- quick and dirty +// + +template +struct counting_iterator +{ + using index_type = T; + using value_type = T; + using reference = T; + + index_type n_; + + CUTE_HOST_DEVICE constexpr + counting_iterator(index_type n = 0) : n_(n) {} + + CUTE_HOST_DEVICE constexpr + index_type operator*() const { return n_; } + + CUTE_HOST_DEVICE constexpr + index_type operator[](index_type i) const { return n_ + i; } + + CUTE_HOST_DEVICE constexpr + counting_iterator operator+(index_type i) const { return {n_ + i}; } + CUTE_HOST_DEVICE constexpr + counting_iterator& operator++() { ++n_; return *this; } + CUTE_HOST_DEVICE constexpr + counting_iterator operator++(int) { counting_iterator ret = *this; ++n_; return ret; } + + CUTE_HOST_DEVICE constexpr + friend bool operator==(counting_iterator const& x, counting_iterator const& y) { return x.n_ == y.n_; } + CUTE_HOST_DEVICE constexpr + friend bool operator!=(counting_iterator const& x, counting_iterator const& y) { return x.n_ != y.n_; } + CUTE_HOST_DEVICE constexpr + friend bool operator< (counting_iterator const& x, counting_iterator const& y) { return x.n_ < y.n_; } + CUTE_HOST_DEVICE constexpr + friend bool operator<=(counting_iterator const& x, counting_iterator const& y) { return x.n_ <= y.n_; } + CUTE_HOST_DEVICE constexpr + friend bool operator> (counting_iterator const& x, counting_iterator const& y) { return x.n_ > y.n_; } + CUTE_HOST_DEVICE constexpr + friend bool operator>=(counting_iterator const& x, counting_iterator const& y) { return x.n_ >= y.n_; } +}; + +template +CUTE_HOST_DEVICE constexpr +T +raw_pointer_cast(counting_iterator const& x) { + return x.n_; +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(T const* const ptr) +{ + printf("ptr["); print(sizeof_bits::value); printf("b](%p)", 
ptr); +} + +template +CUTE_HOST_DEVICE void print(counting_iterator ptr) +{ + printf("counting_iter("); print(ptr.n_); printf(")"); +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, counting_iterator ptr) +{ + return os << "counting_iter(" << ptr.n_ << ")"; +} +#endif // !defined(__CUDACC_RTC__) + +} // end namespace cute diff --git a/include/cute/pointer_flagged.hpp b/include/cute/pointer_flagged.hpp new file mode 100644 index 0000000000..eb8d7e452e --- /dev/null +++ b/include/cute/pointer_flagged.hpp @@ -0,0 +1,199 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::ComposedLayout +#include // cute::make_smem_ptr +#include // cute::is_sparse +#include // cute::make_swizzle_ptr +#include // cute::cast_smem_ptr_to_uint +#include // cute::Int + +namespace cute +{ + +// +// Stand-in Swizzle Layout +// A model of a nullptr smem_ptr with B == sizeof_bits::value +// That represents an unset pointer. 
This is a placeholder type that is waiting for an smem_ptr +// + +template +struct smem_ptr_flag_bits : Int<0> {}; + +using smem_ptr_flag = smem_ptr_flag_bits<1>; + +// A flagged construction method to transform ComposedLayout +// Make a swizzle pointer tensor and check that the intended type size matches +template +CUTE_HOST_DEVICE constexpr +auto +make_tensor(Iterator const& ptr, + ComposedLayout,Layout> const& layout) +{ + static_assert(is_smem::value, "Expected smem."); + static_assert(B == sizeof_bits>::value, "Expected a B-bit pointer type."); + return make_tensor(make_smem_ptr(ptr.get(), layout.layout_a()), + layout.layout_b()); +} + +// NOTE: To preserve smem_ptr_flag_bits under recast ops +template +CUTE_HOST_DEVICE constexpr +auto +upcast(ComposedLayout,Layout> const& layout) +{ + return composition(layout.layout_a(), smem_ptr_flag_bits{}, upcast(layout.layout_b())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(ComposedLayout,Layout> const& layout) +{ + return composition(layout.layout_a(), smem_ptr_flag_bits{}, downcast(layout.layout_b())); +} + +// +// Conversion with swizzle_layout +// + +template +CUTE_HOST_DEVICE +auto +as_position_independent_swizzle_layout(ComposedLayout,Layout> const& layout) +{ + return composition(recast_layout>(layout.layout_a()), Int<0>{}, layout.layout_b()); +} + +template +CUTE_HOST_DEVICE +auto +as_position_independent_swizzle_tensor(Tensor&& tensor) +{ + static_assert(is_smem>::value, "Expected smem tensor."); + using SwizzleFn = get_swizzle_t>; + if constexpr (SwizzleFn::num_bits == 0) { + return tensor; + } else { +#if !defined(NDEBUG) + { + uint32_t address = cast_smem_ptr_to_uint(raw_pointer_cast(static_cast(tensor).data())); + uint32_t mask = ((uint32_t(1) << SwizzleFn::num_base) - 1) | SwizzleFn::swizzle_code; + assert((address & mask) == 0); // Alignment to the Base, Z, and Y of Swizzle + } +#endif + using T = typename remove_cvref_t::value_type; + // Recast swizzle from acting on byte-addressed pointers to elements of type-T + auto new_swizzle = recast_layout(SwizzleFn{}); + // Strip off everything and create a new smem_ptr for type-T + auto new_ptr = make_smem_ptr(raw_pointer_cast(static_cast(tensor).data())); + return make_tensor(new_ptr, composition(new_swizzle, Int<0>{}, tensor.layout())); + } + CUTE_GCC_UNREACHABLE; +} + +// A model of a nullptr sparse_ptr> with B == sizeof_bits::value +// That represents an unset pointer. 
This is a placeholder type that is waiting for an smem_ptr +template +struct smem_sparse_ptr_flag_bits : Int<0> {}; + +template +using smem_sparse_ptr_flag = smem_sparse_ptr_flag_bits; + +// A flagged construction method to transform ComposedLayout +// Make a swizzle pointer tensor and check that the intended type size matches +template +CUTE_HOST_DEVICE constexpr +auto +make_tensor(Iterator const& ptr, + ComposedLayout,Layout> const& layout) +{ + static_assert(is_smem::value, "Expected smem."); + static_assert(is_sparse_ptr::value, "Expected sparse iter"); + static_assert(is_sparse>::value, "Expected sparse elem"); + static_assert(S == iter_value_t::sparsity, "Expected sparsity S"); + static_assert(B == sizeof_bits::raw_type>::value, "Expected B-bit pointer type"); + return make_tensor(make_swizzle_ptr(ptr, layout.layout_a()), layout.layout_b()); +} + +// NOTE: To preserve smem_ptr_flag_bits under recast ops +template +CUTE_HOST_DEVICE constexpr +auto +upcast(ComposedLayout,Layout> const& layout) +{ + static_assert(dependent_false, "Not implemented for safety"); +} + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(ComposedLayout,Layout> const& layout) +{ + static_assert(dependent_false, "Not implemented for safety"); +} + +// +// Display utilities +// + +// Capture and cast smem_ptr_flag Layouts to offset-0 layouts +template +CUTE_HOST_DEVICE +void +print_layout(ComposedLayout,Layout> const& layout) +{ + print_layout(as_position_independent_swizzle_layout(layout)); +} + +template +CUTE_HOST_DEVICE +void +print_latex(ComposedLayout,Layout> const& layout) +{ + print_latex(as_position_independent_swizzle_layout(layout)); +} + +template +CUTE_HOST_DEVICE void print(smem_ptr_flag_bits ptr) +{ + printf("smem_ptr[%db](unset)", B); +} + +template +CUTE_HOST_DEVICE void print(smem_sparse_ptr_flag_bits) +{ + printf("smem_sparse<%d>_ptr[%db](unset)", S, B); +} + +} // end namespace cute diff --git a/include/cute/pointer_sparse.hpp b/include/cute/pointer_sparse.hpp new file mode 100644 index 0000000000..ccae458650 --- /dev/null +++ b/include/cute/pointer_sparse.hpp @@ -0,0 +1,172 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::iter_adaptor +#include // cute::false_type, cute::true_type +#include // cute::ratio + +namespace cute +{ + +// A data type that holds one physical element meant to represent Sparsity number of logical elements +// This class is purposely not compatible with anything -- know what you're doing if you attempt to use it +template +struct sparse_elem +{ + static constexpr int sparsity = Sparsity; + using raw_type = T; + T elem_; + + CUTE_HOST_DEVICE constexpr + explicit sparse_elem(T const& elem = {}) : elem_(elem) {} + + CUTE_HOST_DEVICE constexpr friend bool operator==(sparse_elem const& a, sparse_elem const& b) { return a.elem_ == b.elem_; } + CUTE_HOST_DEVICE constexpr friend bool operator!=(sparse_elem const& a, sparse_elem const& b) { return a.elem_ != b.elem_; } + CUTE_HOST_DEVICE constexpr friend bool operator< (sparse_elem const& a, sparse_elem const& b) { return a.elem_ < b.elem_; } + CUTE_HOST_DEVICE constexpr friend bool operator<=(sparse_elem const& a, sparse_elem const& b) { return a.elem_ <= b.elem_; } + CUTE_HOST_DEVICE constexpr friend bool operator> (sparse_elem const& a, sparse_elem const& b) { return a.elem_ > b.elem_; } + CUTE_HOST_DEVICE constexpr friend bool operator>=(sparse_elem const& a, sparse_elem const& b) { return a.elem_ >= b.elem_; } +}; + +template +struct is_sparse : false_type {}; +template +struct is_sparse : is_sparse {}; +template +struct is_sparse> : true_type {}; +template +static constexpr auto is_sparse_v = is_sparse::value; + +// Overload sizeof_bits for sparse_elem. +// Much like subbyte element types, this is the effective number of bits in a sparse_elem +// rather than actual physical bits that may be used in storing one. Also like subbyte element +// types, modified iterators are required to properly index and access sparse_elems. +// +// Defining sizeof_bits like this makes reasonable expressions like N * sizeof_bits_v meaningful +// even when E is subbyte or sparse. However, this also means that sparse_elem can rather easily be +// confused with subbyte elements and special care should be taken with each. +template +struct sizeof_bits> { + // Simple implementation that conforms to sizeof_bits + //static constexpr auto value = sizeof_bits::value / S; + //static_assert(value != 0, "sizeof_bits=0 detected. Sparsity is larger than width."); + //static_assert((sizeof_bits::value % S) == 0, "Width needs to be a multiple of sparsity.") + + // Interesting experiment that allows any sparsity level to be used by potentially presenting + // an integral_ratio rather than size_t. This is valid in most integer expressions as well. 
+ static constexpr auto value = cute::ratio(cute::Int>{}, cute::Int{}); +}; + +// +// sparse_ptr +// + +template +struct is_sparse_ptr : false_type {}; +template +struct is_sparse_ptr> : is_sparse_ptr {}; + +template +struct sparse_ptr : iter_adaptor> +{ + using reference = typename iterator_traits::reference; + using element_type = typename iterator_traits::element_type; + using value_type = typename iterator_traits::value_type; + + // Sanity, for now + static_assert(is_sparse::value, "Enforce sparse value-type"); + static_assert(Sparsity == iter_value_t::sparsity, "Enforce sparsity S"); + static_assert(not is_sparse_ptr::value, "Enforce sparse singleton"); + + template + CUTE_HOST_DEVICE constexpr + sparse_ptr operator+(Index const& i) const { + // Only allow offset by multiples of the sparsity factor, + // else the misalignments become a bug. E.g. (sparse_ptr<8,I>{} + 7) + 7 + // Motivation for subsparse_iterator or generalization of subbyte_iterator? + assert(i % Sparsity == 0); + return {this->get() + i / Sparsity}; + } + + template + CUTE_HOST_DEVICE constexpr + reference operator[](Index const& i) const { + // Allow offset by any value and dereference. + // Not implemented in terms of sparse_ptr::op+() + return *(this->get() + i / Sparsity); + } +}; + +template +struct is_sparse_ptr> : true_type {}; + +template +CUTE_HOST_DEVICE constexpr +auto +make_sparse_ptr(Iter const& iter) { + if constexpr (Sparsity == 1) { + return iter; + } else { + return sparse_ptr{iter}; + } + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast_ptr(sparse_ptr const& ptr) { + static_assert(not is_sparse::value); + return recast_ptr(ptr.get()); +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(sparse_ptr ptr) +{ + printf("sparse<%d>_", S); print(ptr.get()); +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, sparse_ptr ptr) +{ + return os << "sparse<" << S << ">_" << ptr.get(); +} +#endif + +} // end namespace cute diff --git a/include/cute/pointer_swizzle.hpp b/include/cute/pointer_swizzle.hpp new file mode 100644 index 0000000000..1a802cfdc6 --- /dev/null +++ b/include/cute/pointer_swizzle.hpp @@ -0,0 +1,176 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::iter_adaptor +#include // cute::Swizzle, cute::get_swizzle primary template +#include // cute::iterator_traits +#include // cute::subbyte_iterator + +/* This implements a swizzle pointer of the form + * InvolutionFn o PtrAdd + * where the InvolutionFn need not be linear. + * + * This differs subtly from swizzle_layout because the smem pointer is used + * as the offset. That means that swizzle_layout will implement position-independent + * swizzle layouts, while swizzle_ptr implements position-dependent swizzle tensors. + * Arch chose to design hardware with position-dependent swizzles. + * + * For clarity: + * NormalLayout : DeRef <- PtrAdd <- [Layout] + * ComposedLayout: DeRef <- PtrAdd <- [Swizzle <- OffsetAdd <- Layout] + * SwizzlePtr : [DeRef <- Swizzle <- PtrAdd] <- Layout + * + * Furthermore, for known swizzles, this pointer attempts to decay itself + * to a normal-pointer with a new layout containing dynamic or static strides. + * This is possible by determining the subdomain of the InvolutionFn + * that is identity and testing if the Layout's codomain is contained + * within it. + */ + +namespace cute +{ + +// concept SwizzleFn { +// CUTE_HOST_DEVICE constexpr static uint apply(uint); +// } +// See Swizzle in swizzle.hpp for common swizzle-functions. 
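A usage sketch of the position-dependent swizzle pointer described above (the concrete Swizzle<3,4,3> is just an example pattern; header paths are assumptions):

#include <cute/pointer.hpp>   // make_smem_ptr, added by this diff
#include <cute/swizzle.hpp>   // Swizzle, added by this diff

void swizzle_ptr_sketch(float* buf) {
  auto p  = cute::make_smem_ptr(buf);                          // plain smem_ptr
  auto sp = cute::make_smem_ptr(buf, cute::Swizzle<3,4,3>{});  // swizzle_ptr over the smem_ptr
  // sp[i] applies Swizzle<3,4,3>::apply to the address first and then dereferences,
  // i.e. the swizzle acts on the pointer value itself (position-dependent).
  (void) p; (void) sp;
}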
+ +template +struct swizzle_ptr : iter_adaptor> +{ + using iterator = Iterator; + using reference = typename iterator_traits::reference; + using element_type = typename iterator_traits::element_type; + using value_type = typename iterator_traits::value_type; + + using iter_adaptor>::iter_adaptor; + + template + CUTE_HOST_DEVICE constexpr static + Iter apply_swizzle(Iter ptr) { + return {apply_swizzle(ptr.get())}; + } + + template + CUTE_HOST_DEVICE constexpr static + T* apply_swizzle(T* ptr) { + return reinterpret_cast(SwizzleFn::apply(reinterpret_cast(ptr))); + } + + template + CUTE_HOST_DEVICE constexpr static + subbyte_iterator apply_swizzle(subbyte_iterator ptr) { + return {apply_swizzle(ptr.ptr_), ptr.idx_}; + } + + CUTE_HOST_DEVICE constexpr + reference operator*() const { + return *apply_swizzle(this->get()); + } + + template + CUTE_HOST_DEVICE constexpr + reference operator[](Int const& i) const { + return *apply_swizzle(this->get() + i); + } +}; + +// +// Helper Function +// +template // Found the SwizzleFn +struct get_swizzle> { using type = SwizzleFn; }; +template // Recurse into anything with a ::iterator +struct get_swizzle> : get_swizzle {}; + +template +CUTE_HOST_DEVICE constexpr +swizzle_ptr +make_swizzle_ptr(Iterator ptr, SwizzleFn) { + return {ptr}; +} + +// Swizzle-0 specialization for immediate decay +template +CUTE_HOST_DEVICE constexpr +Iterator +make_swizzle_ptr(Iterator ptr, Swizzle<0,M,S>) { + return ptr; +} + +// +// Recast +// + +template +CUTE_HOST_DEVICE constexpr +auto +raw_pointer_cast(swizzle_ptr const& ptr) { + return raw_pointer_cast(ptr.get()); +} + +// SwizzleFn operates on the pointer address, so it doesn't care about the type +template +CUTE_HOST_DEVICE constexpr +auto +recast_ptr(swizzle_ptr const& ptr) { + return make_swizzle_ptr(recast_ptr(ptr.get()), SwizzleFn{}); +} + +// The statically-known alignment of a swizzle pointer is the alignment of the swizzle function converted to bits +template +CUTE_HOST_DEVICE constexpr +auto +max_alignment(swizzle_ptr const&) { + return Int<8>{} * max_alignment(SwizzleFn{}); +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(swizzle_ptr ptr) +{ + print(SwizzleFn{}); printf("_"); print(ptr.get()); +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, swizzle_ptr ptr) +{ + return os << SwizzleFn{} << "_" << ptr.get(); +} +#endif + +} // end namespace cute diff --git a/include/cute/stride.hpp b/include/cute/stride.hpp new file mode 100644 index 0000000000..f2d31f4e34 --- /dev/null +++ b/include/cute/stride.hpp @@ -0,0 +1,598 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::__CUTE_REQUIRES +#include // cute::is_tuple +#include // cute::is_integral +#include // cute::seq +#include // cute::divmod +#include // cute::basis_get +#include // cute::identity +#include // cute::fold +#include // cute::is_congruent + +namespace cute +{ + +/** crd2idx(c,s,d) maps a coordinate within to an index + * + * This is computed as follows: + * [coord, shape, and stride are all integers => step forward by stride] + * op(c, s, d) => c * d + * [coord is integer, shape and stride are tuple => divmod coord for each mode] + * op(c, (s,S), (d,D)) => op(c % prod(s), s, d) + op(c / prod(s), (S), (D)) + * [coord, shape, and stride are all tuples => consider each mode independently] + * op((c,C), (s,S), (d,D)) => op(c, s, d) + op((C), (S), (D)) + */ +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx(Coord const& coord, + Shape const& shape, + Stride const& stride); + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx_ttt(Coord const& coord, + Shape const& shape, + Stride const& stride, seq) +{ + return (... + crd2idx(get(coord), get(shape), get(stride))); +} + +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx_itt(CInt const& coord, + STuple const& shape, + DTuple const& stride, seq) +{ + if constexpr (sizeof...(Is) == 0) { // Avoid recursion and mod on single/last iter + return crd2idx(coord, get(shape), get(stride)); + } else if constexpr (is_constant<0, CInt>::value) { + return crd2idx(_0{}, get(shape), get(stride)) + + (_0{} + ... 
+ crd2idx(_0{}, get(shape), get(stride))); + } else { // General case + auto [div, mod] = divmod(coord, product(get(shape))); + return crd2idx(mod, get(shape), get(stride)) + + crd2idx_itt(div, shape, stride, seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx(Coord const& coord, + Shape const& shape, + Stride const& stride) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // tuple tuple tuple + static_assert(tuple_size::value == tuple_size< Shape>::value, "Mismatched Ranks"); + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return detail::crd2idx_ttt(coord, shape, stride, tuple_seq{}); + } else { // tuple "int" "int" + static_assert(sizeof(Coord) == 0, "Invalid parameters"); + } + } else { + if constexpr (is_tuple::value) { // "int" tuple tuple + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return detail::crd2idx_itt(coord, shape, stride, tuple_seq{}); + } else { // "int" "int" "int" + return coord * stride; + } + } + + CUTE_GCC_UNREACHABLE; +} + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx_horner(CTuple const& coord, + STuple const& shape, seq) +{ + if constexpr (sizeof...(Is) == 0) { // No recursion on single/last iter + return get(coord); + } else { // General case + return get(coord) + get(shape) * crd2idx_horner(coord, shape, seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +/** crd2idx(c,s) maps a coordinate within Shape to an index + * via a colexicographical enumeration of coordinates in Shape. + * i = c0 + s0 * (c1 + s1 * (c2 + s2 * ...)) + */ +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx(Coord const& coord, + Shape const& shape) +{ + if constexpr (is_integral::value) { // Coord is already an index + return coord; + } else if constexpr (is_integral::value) { + static_assert(dependent_false, "Invalid parameters"); + } else { // Make congruent, flatten, and apply Horner's method + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + auto flat_coord = flatten(coord); + auto flat_shape = flatten(product_like(shape, coord)); + return detail::crd2idx_horner(flat_coord, flat_shape, tuple_seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +/** idx2crd(i,s,d) splits an index into a coordinate within . + * + * This is computed as follows: + * [index, shape, and stride are all integers => determine 1D coord] + * op(i, s, d) => (i / d) % s + * [index is integer, shape and stride are tuple => determine component for each mode] + * op(i, (s,S), (d,D)) => (op(i, s, d), op(i, S, D)...) + * [index, shape, and stride are all tuples => consider each mode independently] + * op((i,I), (s,S), (d,D)) => (op(i, s, d), op((I), (S), (D))) + * + * NOTE: This only works for compact shape+stride layouts. 
A more general version would + * apply to all surjective layouts + */ +template +CUTE_HOST_DEVICE constexpr +auto +idx2crd(Index const& idx, + Shape const& shape, + Stride const& stride) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // tuple tuple tuple + static_assert(tuple_size::value == tuple_size< Shape>::value, "Mismatched Ranks"); + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return transform(idx, shape, stride, [](auto const& i, auto const& s, auto const& d){ return idx2crd(i,s,d); }); + } else { // tuple "int" "int" + static_assert(sizeof(Index) == 0, "Invalid parameters"); + } + } else { + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // "int" tuple tuple + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return transform(shape, stride, [&](auto const& s, auto const& d){ return idx2crd(idx,s,d); }); + } else { // "int" tuple "int" + return transform(shape, compact_col_major(shape, stride), [&](auto const& s, auto const& d){ return idx2crd(idx,s,d); }); + } + } else { // "int" "int" "int" + if constexpr (is_constant<1, Shape>::value) { + // Skip potential stride-0 division + return Int<0>{}; + } else { + return (idx / stride) % shape; + } + } + } + + CUTE_GCC_UNREACHABLE; +} + +/** idx2crd(i,s) splits an index into a coordinate within Shape + * via a colexicographical enumeration of coordinates in Shape. + * c0 = (idx / 1) % s0 + * c1 = (idx / s0) % s1 + * c2 = (idx / (s0 * s1)) % s2 + * ... + */ +template +CUTE_HOST_DEVICE constexpr +auto +idx2crd(Index const& idx, + Shape const& shape) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // tuple tuple + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return transform(idx, shape, [](auto const& i, auto const& s) { return idx2crd(i,s); }); + } else { // tuple "int" + static_assert(sizeof(Index) == 0, "Invalid parameters"); + } + } else { + if constexpr (is_tuple::value) { // "int" tuple + return transform_leaf(as_arithmetic_tuple(crd2idx(idx, shape, make_basis_like(shape))), identity{}); + } else { // "int" "int" + return idx; + } + } + + CUTE_GCC_UNREACHABLE; +} + +// +// crd2crd +// + +template +CUTE_HOST_DEVICE constexpr +auto +crd2crd(Coord const& coord, + SShape const& src_shape, + DShape const& dst_shape) +{ + if constexpr (is_tuple::value && is_tuple::value && is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return transform(coord, src_shape, dst_shape, [](auto const& c, auto const& s, auto const& d) { return crd2crd(c,s,d); }); + } else { + // assert(size(src_shape) == size(dst_shape)) + return idx2crd(crd2idx(coord, src_shape), dst_shape); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Compact Major +// + +// Tags for common layouts and dispatching +struct LayoutLeft; // Col-major layout mapping; leftmost extent has stride 1 +using GenColMajor = LayoutLeft; // Alias + +struct LayoutRight; // Row-major layout mapping; rightmost extent has stride 1 +using GenRowMajor = LayoutRight; // Alias + +namespace detail { + +// For GCC8.5 -- Use of lambdas in unevaluated contexts. Instead use function objects. 
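A worked example of the index/coordinate mappings and compact strides defined in this header (illustrative values only):

// crd2idx((1,2), (3,4), (1,3))  =  1*1 + 2*3  =  7
// idx2crd(7, (3,4), (1,3))      =  ((7/1) % 3, (7/3) % 4)  =  (1,2)
// compact_col_major((3,4))      =  (_1,3)    // LayoutLeft : leftmost mode gets stride 1
// compact_row_major((3,4))      =  (4,_1)    // LayoutRight: rightmost mode gets stride 1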
+template +struct CompactLambda; + +// @pre is_integral +// Return (result, current * product(shape)) to enable recurrence +template +CUTE_HOST_DEVICE constexpr +auto +compact(Shape const& shape, + Current const& current) +{ + if constexpr (is_tuple::value) { // Shape::tuple Current::int + using Lambda = CompactLambda; // Append or Prepend + using Seq = typename Lambda::template seq; // Seq or RSeq + return cute::detail::fold(shape, cute::make_tuple(cute::make_tuple(), current), Lambda{}, Seq{}); + } else { // Shape::int Current::int + if constexpr (is_constant<1, Shape>::value) { + return cute::make_tuple(Int<0>{}, current); // If current is dynamic, this could save a reg + } else { + return cute::make_tuple(current, current * shape); + } + } + + CUTE_GCC_UNREACHABLE; +} + +// For GCC8.5 -- Specialization LayoutLeft +template <> +struct CompactLambda +{ + template + CUTE_HOST_DEVICE constexpr auto + operator()(Init const& init, Shape const& si) { + auto result = detail::compact(si, get<1>(init)); + return cute::make_tuple(append(get<0>(init), get<0>(result)), get<1>(result)); // Append + } + + template + using seq = tuple_seq; // Seq +}; + +// For GCC8.5 -- Specialization LayoutRight +template <> +struct CompactLambda +{ + template + CUTE_HOST_DEVICE constexpr auto + operator()(Init const& init, Shape const& si) { + auto result = detail::compact(si, get<1>(init)); + return cute::make_tuple(prepend(get<0>(init), get<0>(result)), get<1>(result)); // Prepend + } + + template + using seq = tuple_rseq; // RSeq +}; + +} // end namespace detail + +template , + __CUTE_REQUIRES(is_tuple::value || is_integral::value)> +CUTE_HOST_DEVICE constexpr +auto +compact_major(Shape const& shape, + Current const& current = {}) +{ + if constexpr (is_tuple::value) { // Shape::tuple Current::tuple + static_assert(is_tuple::value, "Invalid parameters"); + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + // Recurse to apply to the terminals of current + return transform(shape, current, [&](auto const& s, auto const& c){ return compact_major(s,c); }); + } else { + return get<0>(detail::compact(shape, current)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Compact Col Major +// + +struct LayoutLeft { + template + using Apply = decltype(compact_major(declval())); +}; + +template > +CUTE_HOST_DEVICE constexpr +auto +compact_col_major(Shape const& shape, + Current const& current = {}) +{ + return compact_major(shape, current); +} + +// +// Compact Row Major +// + +struct LayoutRight { + template + using Apply = decltype(compact_major(declval())); +}; + +template > +CUTE_HOST_DEVICE constexpr +auto +compact_row_major(Shape const& shape, + Current const& current = {}) +{ + return compact_major(shape, current); +} + +// +// Compact Order -- compute a compact stride based on an ordering of the modes +// + +namespace detail { + +// @pre weakly_congruent(order, shape) +// @pre is_congruent +// @pre is_static +// @pre is_static +template +CUTE_HOST_DEVICE constexpr +auto +compact_order(Shape const& shape, Order const& order, + RefShape const& ref_shape, RefOrder const& ref_order) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Need equal rank of shape and order"); + return transform(shape, order, [&](auto const& s, auto const& o) { return compact_order(s, o, ref_shape, ref_order); }); + } else { + // Compute the starting stride for this shape by accumulating all shapes corresponding to lesser orders + auto stride_start = product(transform(ref_shape, 
ref_order, + [&](auto const& s, auto const& o) { + return conditional_return(o < order, s, Int<1>{}); + })); + return compact_col_major(shape, stride_start); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +compact_order(Shape const& shape, Order const& order) +{ + auto ref_shape = flatten_to_tuple(product_like(shape, order)); + + auto flat_order = flatten_to_tuple(order); + // Find the largest static element of order + auto max_order = cute::fold(flat_order, Int<0>{}, [](auto v, auto order) { + if constexpr (is_constant::value) { + return order; + } else { + return v; + } + + CUTE_GCC_UNREACHABLE; + }); + // Replace any dynamic elements within order with large-static elements + auto max_seq = make_range{}; + auto ref_order = cute::transform(max_seq, flat_order, [](auto seq_v, auto order) { + if constexpr (is_static::value) { + return order; + } else { + return seq_v; + } + + CUTE_GCC_UNREACHABLE; + }); + + auto new_order = unflatten(ref_order, order); + + return detail::compact_order(shape, new_order, ref_shape, ref_order); +} + +template +CUTE_HOST_DEVICE constexpr +auto +compact_order(Shape const& shape, GenColMajor const& major) +{ + return compact_major(shape); +} + +template +CUTE_HOST_DEVICE constexpr +auto +compact_order(Shape const& shape, GenRowMajor const& major) +{ + return compact_major(shape); +} + +// +// Coordinate iterator +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +void +increment(Coord& coord, Shape const& shape, Order const& order) +{ + ++basis_get(get<0>(order), coord); + cute::for_each(make_range<1, tuple_size::value>{}, [&](auto i){ + if (basis_get(get(order), coord) == basis_get(get(order), shape)) { + basis_get(get(order), coord) = 0; + ++basis_get(get(order), coord); + } + }); +} + +/** Increment a (dynamic) coord colexicographically within a shape + * @pre is_congruent::value + * \code + * auto shape = make_shape(1,2,make_shape(2,3),3); + * auto coord = repeat_like(shape, 0); + * + * for (int i = 0; i < size(shape); ++i) { + * std::cout << i << ": " << coord << std::endl; + * increment(coord, shape); + * } + * \endcode + */ +template +CUTE_HOST_DEVICE constexpr +void +increment(Coord& coord, Shape const& shape) +{ + increment(coord, shape, flatten_to_tuple(make_basis_like(shape))); +} + +} // end namespace detail + +struct ForwardCoordIteratorSentinel +{}; + +// A forward iterator for a starting coordinate in a shape's domain, and a shape. +// The starting coordinate may be zero but need not necessarily be. 
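A small sketch of iterating a shape's coordinates with the iterator defined just below (the umbrella include <cute/tensor.hpp> is assumed):

#include <cstdio>
#include <cute/tensor.hpp>   // assumed umbrella include

void coord_iter_sketch() {
  auto shape = cute::make_shape(2, 3);
  auto it    = cute::make_coord_iterator(shape);    // starts at (0,0), increments colex
  for (int i = 0; i < cute::size(shape); ++i, ++it) {
    cute::print(*it); printf("\n");                 // (0,0) (1,0) (0,1) (1,1) (0,2) (1,2)
  }
}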
+template +struct ForwardCoordIterator +{ + static_assert(is_congruent::value); + + CUTE_HOST_DEVICE constexpr + Coord const& operator*() const { return coord; } + CUTE_HOST_DEVICE constexpr + ForwardCoordIterator& operator++() { detail::increment(coord, shape, Order{}); return *this; } + // Sentinel for the end of the implied range + CUTE_HOST_DEVICE constexpr + bool operator==(ForwardCoordIteratorSentinel const&) const { return basis_get(back(Order{}), coord) == basis_get(back(Order{}), shape); } + CUTE_HOST_DEVICE constexpr + bool operator!=(ForwardCoordIteratorSentinel const&) const { return basis_get(back(Order{}), coord) != basis_get(back(Order{}), shape); } + // NOTE: These are expensive, avoid use + CUTE_HOST_DEVICE constexpr + bool operator==(ForwardCoordIterator const& other) const { return coord == other.coord; } + CUTE_HOST_DEVICE constexpr + bool operator!=(ForwardCoordIterator const& other) const { return coord != other.coord; } + + Coord coord; + Shape const& shape; +}; + +// A forward iterator for a coordinate that starts from a provided coordinate and increments in a prescribed order +template +CUTE_HOST_DEVICE constexpr +auto +make_coord_iterator(Coord const& coord, Shape const& shape) +{ + static_assert(is_congruent::value); + static_assert(is_congruent::value); + static_assert(is_congruent::value); + auto flat_order = flatten_to_tuple(Order{}); + auto inv_order = transform(make_seq{}, [&](auto i){ return find(flat_order, i); }); + auto basis_order = transform_leaf(inv_order, [&](auto i) { return get(flatten_to_tuple(make_basis_like(shape))); }); + return ForwardCoordIterator{coord,shape}; +} + +// A forward iterator for a coordinate that starts from a provided coordinate and increments colex +template +CUTE_HOST_DEVICE constexpr +auto +make_coord_iterator(Coord const& coord, Shape const& shape) +{ + static_assert(is_congruent::value); + auto basis_order = flatten_to_tuple(make_basis_like(shape)); + return ForwardCoordIterator{coord,shape}; +} + +// A forward iterator for a coordinate that starts from zero and increments in a prescribed order +template +CUTE_HOST_DEVICE constexpr +auto +make_coord_iterator(Shape const& shape) +{ + return make_coord_iterator(repeat_like(shape, int(0)), shape); +} + +// A forward iterator for a coordinate that starts from zero and increments colex +template +CUTE_HOST_DEVICE constexpr +auto +make_coord_iterator(Shape const& shape) +{ + return make_coord_iterator(repeat_like(shape, int(0)), shape); +} + +} // end namespace cute diff --git a/include/cute/swizzle.hpp b/include/cute/swizzle.hpp new file mode 100644 index 0000000000..52abf856dd --- /dev/null +++ b/include/cute/swizzle.hpp @@ -0,0 +1,498 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::is_tuple +#include // cute::constant +#include // cute::max, cute::min +#include // cute::transform_apply + +namespace cute +{ + +// A generic Swizzle functor +/* 0bxxxxxxxxxxxxxxxYYYxxxxxxxZZZxxxx + * ^--^ MBase is the number of least-sig bits to keep constant + * ^-^ ^-^ BBits is the number of bits in the mask + * ^---------^ SShift is the distance to shift the YYY mask + * (pos shifts YYY to the right, neg shifts YYY to the left) + * + * e.g. Given + * 0bxxxxxxxxxxxxxxxxYYxxxxxxxxxZZxxx + * the result is + * 0bxxxxxxxxxxxxxxxxYYxxxxxxxxxAAxxx where AA = ZZ xor YY + */ +template +struct Swizzle +{ + static constexpr int num_bits = BBits; + static constexpr int num_base = MBase; + static constexpr int num_shft = SShift; + + static_assert(num_base >= 0, "MBase must be positive."); + static_assert(num_bits >= 0, "BBits must be positive."); + static_assert(abs(num_shft) >= num_bits, "abs(SShift) must be more than BBits."); + + // using 'int' type here to avoid unintentially casting to unsigned... unsure. 
+ using bit_msk = cute::constant; + using yyy_msk = cute::constant; + using zzz_msk = cute::constant; + using msk_sft = cute::constant; + + static constexpr uint32_t swizzle_code = uint32_t(yyy_msk{} | zzz_msk{}); + + template + CUTE_HOST_DEVICE constexpr static + auto + apply(Offset const& offset) + { + return offset ^ shiftr(offset & yyy_msk{}, msk_sft{}); // ZZZ ^= YYY + } + + template + CUTE_HOST_DEVICE constexpr + auto + operator()(Offset const& offset) const + { + return apply(offset); + } + + template + CUTE_HOST_DEVICE constexpr + auto + operator==(Swizzle const&) const + { + return B == BBits && M == MBase && S == SShift; + } +}; + +// +// make_swizzle<0b1000, 0b0100>() -> Swizzle<1,2,1> +// make_swizzle<0b11000000, 0b00000110>() -> Swizzle<2,1,5> +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_swizzle() +{ + constexpr uint32_t BZ = popcount(Y); // Number of swizzle bits + constexpr uint32_t BY = popcount(Z); // Number of swizzle bits + static_assert(BZ == BY, "Number of bits in Y and Z don't match"); + constexpr uint32_t TZ_Y = countr_zero(Y); // Number of trailing zeros in Y + constexpr uint32_t TZ_Z = countr_zero(Z); // Number of trailing zeros in Z + constexpr uint32_t M = cute::min(TZ_Y, TZ_Z) % 32; + constexpr int32_t S = int32_t(TZ_Y) - int32_t(TZ_Z); // Difference in trailing zeros + static_assert((Y | Z) == Swizzle::swizzle_code, "Something went wrong."); + return Swizzle{}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Swizzle, Swizzle) +{ + static_assert(S0 == S1, "Can only merge swizzles of the same shift."); + constexpr uint32_t Y = Swizzle::yyy_msk::value ^ Swizzle::yyy_msk::value; + constexpr uint32_t Z = Swizzle::zzz_msk::value ^ Swizzle::zzz_msk::value; + return make_swizzle(); + + //return ComposedFn, Swizzle>{}; +} + +// +// Utility for slicing and swizzle "offsets" +// + +// For swizzle functions, it is often needed to keep track of which bits are +// consumed and which bits are free. Furthermore, it is useful to know whether +// each of these bits is known statically or dynamically. + +// MixedBits is an 32-bit unsigned integer class where some bits are known statically +// and some bits are known dynamically. These sets of bits are disjoint and it is +// known statically which bits are known dynamically. + +// MixedBits can only be manipulated through bitwise operations + +// Abstract value: StaticInt | (dynamic_int_ & StaticFlags) +template // 0: static, 1: dynamic +struct MixedBits +{ + // Representation invariants + static_assert(StaticFlags != 0, "Should be at least one dynamic bit in MixedBits."); + static_assert((StaticInt & StaticFlags) == 0, "No static/dynamic overlap allowed in MixedBits."); + + uint32_t dynamic_int_; + // assert((dynamic_int_ & ~StaticFlags) == 0); + + CUTE_HOST_DEVICE constexpr operator uint32_t() const noexcept { return StaticInt | dynamic_int_; } +}; + +// Return a value representing (C{} | (d & C)) potentially using MixedBits to track s and f. +// This maker does allow ((s & f) != 0) and enforces the MixedBits invariant before creation. 
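+// For example (illustrative):
+//   make_mixed_bits(C<0b0100>{}, d, C<0b0011>{})
+// yields MixedBits<0b0100,0b0011>{uint32_t(d) & 0b0011}: bit 2 is statically 1,
+// bits [1:0] come from d at runtime, and every other bit is statically 0.
+// If d is itself a static integer, the result collapses to a plain static C<> instead.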
+template +CUTE_HOST_DEVICE constexpr +auto +make_mixed_bits(C, DynamicType const& d, C) +{ + static_assert(is_integral::value); + constexpr uint32_t new_f = uint32_t(f) & ~uint32_t(s); // StaticBits take precedence, M<0,f>{d} | C{} + if constexpr (new_f == 0 || is_static::value) { + return C{} | (d & C{}); // Just return a static int + } else { + return MixedBits{uint32_t(d) & new_f}; // MixedBits + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Operators +// + +// Equality +template +CUTE_HOST_DEVICE constexpr +auto +operator==(MixedBits const& m, C) +{ + return (S0 == (uint32_t(S1) & ~F0)) && (m.dynamic_int_ == (uint32_t(S1) & F0)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator==(C s, MixedBits const& m) +{ + return m == s; +} + +// Bitwise AND +template +CUTE_HOST_DEVICE constexpr +auto +operator&(MixedBits const& m0, MixedBits const& m1) +{ + // Truth table for (S0,D0,F0) & (S1,D1,F1) -> (S,D,F) + // S0D0F0 | 0X0 | 001 | 011 | 1X0 | + // S1D1F1 + // 0X0 | 0X0 | 0X0 | 0X0 | 0X0 | + // 001 | 0X0 | 001 | 001 | 001 | + // 011 | 0X0 | 001 | 011 | 011 | + // 1X0 | 0X0 | 001 | 011 | 1X0 | + + return make_mixed_bits(C{}, + //(S0 | m0.dynamic_int_) & (S1 | m1.dynamic_int_), + ((S1 & F0) & m0.dynamic_int_) | ((S0 & F1) & m1.dynamic_int_) | (m0.dynamic_int_ & m1.dynamic_int_), + C<(S1 & F0) | (S0 & F1) | (F0 & F1)>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator&(MixedBits const& m, C) +{ + return make_mixed_bits(C{}, + m.dynamic_int_, + C{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator&(C s, MixedBits const& m) +{ + return m & s; +} + +// Bitwise OR +template +CUTE_HOST_DEVICE constexpr +auto +operator|(MixedBits const& m0, MixedBits const& m1) +{ + // Truth table for (S0,D0,F0) | (S1,D1,F1) -> (S,D,F) + // S0D0F0 | 0X0 | 001 | 011 | 1X0 | + // S1D1F1 + // 0X0 | 0X0 | 001 | 011 | 1X0 | + // 001 | 001 | 001 | 011 | 1X0 | + // 011 | 011 | 011 | 011 | 1X0 | + // 1X0 | 1X0 | 1X0 | 1X0 | 1X0 | + + return make_mixed_bits(C{}, + ((~S1 & F0) & m0.dynamic_int_) | ((~S0 & F1) & m1.dynamic_int_), + C<(~S0 & F1) | (~S1 & F0)>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator|(MixedBits const& m, C) +{ + return make_mixed_bits(C{}, + m.dynamic_int_, + C{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator|(C s, MixedBits const& m) +{ + return m | s; +} + +// Bitwise XOR +template +CUTE_HOST_DEVICE constexpr +auto +operator^(MixedBits const& m0, MixedBits const& m1) +{ + // Truth table for (S0,D0,F0) ^ (S1,D1,F1) -> (S,D,F) + // S0D0F0 | 0X0 | 001 | 011 | 1X0 | + // S1D1F1 + // 0X0 | 0X0 | 001 | 011 | 1X0 | + // 001 | 001 | 001 | 011 | 011 | + // 011 | 011 | 011 | 001 | 001 | + // 1X0 | 1X0 | 011 | 001 | 0X0 | + + return make_mixed_bits(C<(~S0 & S1 & ~F0) | (S0 & ~S1 & ~F1)>{}, + (S0 | m0.dynamic_int_) ^ (S1 | m1.dynamic_int_), + C{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator^(MixedBits const& m, C) +{ + return make_mixed_bits(C<(~S0 & uint32_t(S1) & ~F0) | (S0 & ~uint32_t(S1))>{}, + (S0 | m.dynamic_int_) ^ uint32_t(S1), + C{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator^(C s, MixedBits const& m) +{ + return m ^ s; +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator<<(MixedBits const& m, C) +{ + return make_mixed_bits(C<(S0 << S1)>{}, + m.dynamic_int_ << S1, + C<(F0 << S1)>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator>>(MixedBits const& m, C) +{ + return make_mixed_bits(C<(S0 >> S1)>{}, + m.dynamic_int_ >> S1, + C<(F0 >> S1)>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto 
+shiftl(MixedBits const& m, C s) +{ + if constexpr (S1 >= 0) { + return m << s; + } else { + return m >> -s; + } +} + +template +CUTE_HOST_DEVICE constexpr +auto +shiftr(MixedBits const& m, C s) +{ + if constexpr (S1 >= 0) { + return m >> s; + } else { + return m << -s; + } +} + +// +// Upcast and Downcast +// + +template +CUTE_HOST_DEVICE constexpr +auto +safe_div(MixedBits const& m, C s) +{ + static_assert(has_single_bit(uint32_t(S1)), "Only divide MixedBits by powers of two."); + return make_mixed_bits(safe_div(C{}, s), + safe_div(m.dynamic_int_, s), + safe_div(C{}, s)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(MixedBits const& m) +{ + static_assert(has_single_bit(N), "Only divide MixedBits by powers of two."); + return safe_div(m, C{}); +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +upcast(T const& m) +{ + return safe_div(m, C{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(MixedBits const& m) +{ + static_assert(has_single_bit(N), "Only scale MixedBits by powers of two."); + return make_mixed_bits(C{}, + m.dynamic_int_ * N, + C{}); +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +downcast(T const& m) +{ + return m * C{}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_alignment(MixedBits const&) +{ + return C{}; +} + +template +CUTE_HOST_DEVICE constexpr +C +max_alignment(C const& c) +{ + return c; +} + +// +// Convert a Pow2Layout+Coord to a MixedBits +// + +template +CUTE_HOST_DEVICE constexpr +auto +to_mixed_bits(Shape const& shape, Stride const& stride, Coord const& coord) +{ + if constexpr (is_tuple::value && is_tuple::value && is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched ranks"); + static_assert(tuple_size::value == tuple_size::value, "Mismatched ranks"); + return transform_apply(shape, stride, coord, [](auto const& s, auto const& d, auto const& c) { return to_mixed_bits(s,d,c); }, + [](auto const&... 
a) { return (a ^ ...); }); + } else if constexpr (is_integral::value && is_integral::value && is_integral::value) { + static_assert(decltype(shape*stride)::value == 0 || has_single_bit(decltype(shape*stride)::value), "Requires pow2 shape*stride."); + return make_mixed_bits(Int<0>{}, coord * stride, (shape - Int<1>{}) * stride); + } else { + static_assert(is_integral::value && is_integral::value && is_integral::value, "Either Shape, Stride, and Coord must be all tuples, or they must be all integral (in the sense of cute::is_integral)."); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +to_mixed_bits(Layout const& layout, Coord const& coord) +{ + return to_mixed_bits(layout.shape(), layout.stride(), idx2crd(coord, layout.shape())); +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(Swizzle const&) +{ + printf("Sw<%d,%d,%d>", B, M, S); +} + +template +CUTE_HOST_DEVICE void print(MixedBits const& m) +{ + printf("M_%u|(%u&%u)=%u", S, m.dynamic_int_, F, uint32_t(m)); +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, Swizzle const&) +{ + return os << "Sw<" << B << "," << M << "," << S << ">"; +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, MixedBits const& m) +{ + return os << "M_" << S << "|(" << m.dynamic_int_ << "&" << F << ")=" << uint32_t(m); +} +#endif // !defined(__CUDACC_RTC__) + +// +// Helper Function +// +template // Default No-Swizzle +struct get_swizzle { using type = Swizzle<0,4,3>; }; + +template +using get_swizzle_t = typename get_swizzle::type; + +} // end namespace cute diff --git a/include/cute/swizzle_layout.hpp b/include/cute/swizzle_layout.hpp new file mode 100644 index 0000000000..7f7161bc32 --- /dev/null +++ b/include/cute/swizzle_layout.hpp @@ -0,0 +1,584 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::Layout +#include // cute::ComposedLayout +#include // cute::Swizzle, cute::get_swizzle primary template + +/* Specialized functionality for a ComposedLayout of the form + * InvolutionFn o Offset o LayoutB + * where the InvolutionFn is a Swizzle and is not linear (hence the need for the Offset). + * + * Because these are specializations for core functions of ComposedLayout, these Swizzle Layouts + * provide similar functionality to Layout including tiling, partitioning, + * coordinate-to-index mapping and layout manipulations, but are not considered "normal" layouts. + * For example, these provide shape() and size() functions, but do not provide stride() functions. + * + * Furthermore, each of these specializations uses Swizzle<>-specific knowledge in its implementation and + * attempts to decay itself to a normal-layout with dynamic or static strides when certain slicing conditions + * are met. This is possible by determining the subdomain of the Swizzle<> function that is identity and + * testing if LayoutB's codomain is contained within it. In general, MizedBits is used as the Offset to track + * statically-vs-dynamically known bits in the Offset to improve the decay to static or dynamic normal layouts. + */ + +namespace cute +{ + +// +// Helper Function +// +template +struct get_swizzle,Offset,LayoutB>> { using type = Swizzle; }; + +// +// Constructors +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Swizzle const& sxor) +{ + return composition(sxor, Layout,Int<1>>{}); +} + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +transfer_swizzle(Layout const& old_layout, + Layout const& new_layout) +{ + // Our goal is to determine a new swizzle for the strides in new_layout for consistent vectorizations + + // This is accomplished by identifying + // S o L :=: S? 
o L* + // We identify the "active" portion of S by computing (P o L)(c*) where P is a projection generated by S + // Then that active identifier is transformed through the layouts: + // L*(L[(P o L)(c*)]) + // which is a new swizzle identifier for S?, the new swizzle + + // Projections of the swizzle layout for composition, P + auto swizzle_only_zy = make_layout(make_shape (Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 << B )>{}, Int<1>{}), + make_stride( Int<0>{}, Int<(1 << M)>{}, Int<0>{}, Int<(1 << (M+abs(S)))>{}, Int<0>{})); + + // Compose with the tile to get the swizzle projection, P o L [The Z and Y contributing portions of L] + auto layout_only_zy = composition(swizzle_only_zy, old_layout); + // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*) + auto swizzle_active_bits = layout_only_zy(size(layout_only_zy)-Int<1>{}); + + // Get the Z bit and the Y bits -- keep only those that are active in Z *and* Y + auto zzz_msk = typename Swizzle::zzz_msk{}; + auto yyy_msk = typename Swizzle::yyy_msk{}; + auto msk_sft = typename Swizzle::msk_sft{}; + auto active_Z = swizzle_active_bits & shiftr(swizzle_active_bits, msk_sft) & zzz_msk; + auto active_Y = swizzle_active_bits & shiftr(swizzle_active_bits, -msk_sft) & yyy_msk; + + // Pass the identifiers through the old layout and new layout to make a new swizzle identifier, L*(L[(P o L)(c*)]) + auto new_active_Z = new_layout(old_layout.get_1d_coord(active_Z)); + auto new_active_Y = new_layout(old_layout.get_1d_coord(active_Y)); + + // Use this new swizzle identifier to construct the new swizzle for new_layout + // (this also makes sure it's a "valid" swizzle that Swizzle can represent) + return composition(make_swizzle(), new_layout); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +make_fragment_like(ComposedLayout,Offset,Layout> const& layout) +{ + return make_fragment_like(layout.layout_b()); +} + +// +// Utilities +// + +namespace detail { + +// Get just the Swizzle part of a composed layout. +template +CUTE_HOST_DEVICE constexpr +auto +get_swizzle_portion(ComposedLayout,Offset,LayoutB>) +{ + return Swizzle{}; +} + +// A non-swizzled layout's "Swizzle part" is the identity swizzle. +template +CUTE_HOST_DEVICE constexpr +auto +get_swizzle_portion(Layout) +{ + return Swizzle<0,4,3>{}; +} + +// Get the "non-swizzle" part of a composed layout, +// which is the underlying (non-composed) Layout. +template +CUTE_HOST_DEVICE constexpr +auto +get_nonswizzle_portion(ComposedLayout,Offset,LayoutB> const& slayout) +{ + return slayout.layout_b(); +} + +// The non-swizzle part of a non-swizzled layout is just the Layout. +template +CUTE_HOST_DEVICE constexpr +auto +get_nonswizzle_portion(Layout const& slayout) +{ + return slayout; +} + +} // namespace detail + +// +// Slice a Swizzled ComposedLayout +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +make_swizzle_strides(true_type, + IntZ const& Z, + IntY const& Y, + Offset const& offset, + int_sequence) +{ + // Below is an optimized/compressed version of: + //return cute::make_tuple((swizzle(offset + Z*Int<(1 << I)>{}) - swizzle(offset))...); + // with knowledge of Swizzle, I... 
ranges for each B bits, + // and the layout won't slice along z-bits that are already set + + // y\z 0 1 + // 0 Z DC + // 1 -Z DC + + return cute::make_tuple(conditional_return((offset & (Y << Int{})) == Int<0>{}, Z * Int<(1 << I)>{}, -Z * Int<(1 << I)>{})...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_swizzle_strides(false_type, + IntZ const& Z, + IntY const& Y, + Offset const& offset, + int_sequence) +{ + // Below is an optimized/compressed version of: + //return cute::make_tuple((swizzle(offset + Y*Int<(1 << I)>{}) - swizzle(offset))...); + // with knowledge of Swizzle, I... ranges for each B bits, + // and the layout won't slice along y-bits that are already set + + // y\z 0 1 + // 0 Y+Z Y-Z + // 1 DC DC + + return cute::make_tuple(conditional_return((offset & (Z << Int{})) == Int<0>{}, (Y+Z) * Int<(1 << I)>{}, (Y-Z) * Int<(1 << I)>{})...); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +slice_and_offset(Coord const& coord, ComposedLayout,Offset,Layout> const& layout) +{ + if constexpr (all_underscore::value) { + // Skip the expensive/complicated attempt to decay to a normal layout and just reshape + return cute::make_tuple(composition(layout.layout_a(), layout.offset(), slice(coord, layout.layout_b())), Int<0>{}); + } else { + + // Projections of the swizzle layout for composition + auto sw = make_layout(make_shape(Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 << B)>{}, Int<1>{})); + + auto swizzle_anti_zy = make_layout(shape(sw), + make_stride(stride<0>(sw), Int<0>{}, stride<2>(sw), Int<0>{}, size(sw))); + auto swizzle_only_zy = make_layout(shape(sw), + make_stride( Int<0>{}, stride<1>(sw), Int<0>{}, stride<3>(sw), Int<0>{})); + + // The portion of the layout that is not yet consumed + auto sliced_layout = slice(coord, layout.layout_b()); + + // The portion of the layout that we are consuming now + auto diced_layout = dice(coord, layout.layout_b()); + auto diced_coord = dice(coord, coord); + + auto diced_layout_anti_zy = composition(swizzle_anti_zy, diced_layout); + auto diced_layout_only_zy = composition(swizzle_only_zy, diced_layout); + + // New swizzle and offset + auto swizzle = layout.layout_a(); + // offset_only_zy interacts with swizzle and gets accumulated with layout.offset() + // being careful about the static/dynamic contributions from diced_layout and diced_coord + auto offset_only_zy = layout.offset() ^ to_mixed_bits(diced_layout_only_zy, diced_coord); + // offset_anti_zy always gets passed through, no interaction with swizzle + auto offset_anti_zy = diced_layout_anti_zy(diced_coord); + + // If Layout's codomain hits on Y AND Z, then it's not reducible + // If Layout's codomain hits on Y XOR Z, then it's dynamic-normal + // If Layout's codomain hits on neither Y NOR Z, then it's static-normal + + // If the sliced_layout hits two bits that are swizzled together, then don't attempt to decay + + // Compose with the layout to get the swizzle projection, P o L [The Z and Y contributing portions of L] + // (this also tests that shape/stride of layout compose with swizzle) + auto sliced_layout_only_zy = composition(swizzle_only_zy, sliced_layout); + // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*) + [[maybe_unused]] auto swizzle_active_bits = sliced_layout_only_zy(size(sliced_layout_only_zy)-Int<1>{}); + + // Determine if any active bits collide under the swizzle for potential decay + if constexpr (is_constant<0, decltype(not (swizzle_active_bits & 
~swizzle(swizzle_active_bits)))>::value) + { // Hits on Y AND Z, so it's not reducible + return cute::make_tuple(composition(swizzle, offset_only_zy, sliced_layout), offset_anti_zy); + } else + { // Misses on Y or Z, so it's static-normal or dynamic-normal + + // Lowest bit of the Z and Y masks + auto Z = typename Swizzle::zzz_msk{} & -typename Swizzle::zzz_msk{}; + auto Y = typename Swizzle::yyy_msk{} & -typename Swizzle::yyy_msk{}; + auto stride_lo = detail::make_swizzle_strides(Z < Y, Z, Y, offset_only_zy, make_int_sequence{}); + auto stride_hi = detail::make_swizzle_strides(Z > Y, Z, Y, offset_only_zy, make_int_sequence{}); + + // Construct a (dynamic) layout that we can perform the composition with + auto swizzle_layout = make_layout(make_shape (Int<(1 << M)>{}, repeat(Int<2>{}), Int<(1 << (abs(S)-B))>{}, repeat(Int<2>{}), Int< 1>{}), + make_stride(Int< 1>{}, stride_lo, Int<(1 << (M+B))>{}, stride_hi , Int<(1 << (M+B+abs(S)))>{})); + + // Decay to a normal layout with offset + return cute::make_tuple(composition(swizzle_layout, sliced_layout), + swizzle(offset_only_zy) + offset_anti_zy); + } + } + + CUTE_GCC_UNREACHABLE; +} + +// +// composition +// + +// Ignore identity case +template +CUTE_HOST_DEVICE constexpr +auto +composition(Swizzle<0,M,S> const&, + Int<0> const&, + Layout const& layout) +{ + return layout; +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Swizzle const& sxor, + Layout const& layout) +{ + return composition(sxor, Int<0>{}, layout); +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Layout const& a, + Swizzle const& b) +{ + // Get the Z bits and the Y bits + auto active_Y = a(typename Swizzle::yyy_msk{}); + auto active_Z = a(typename Swizzle::zzz_msk{}); + + // Works in simple cases... but could be greatly generalized + + return composition(make_swizzle(), a); +} + +// +// inverse +// + +// Specialization to attempt to pass-through the Swizzle back to the left -- Needed? +template +CUTE_HOST_DEVICE constexpr +auto +right_inverse(ComposedLayout,Offset,Layout> const& layout) +{ + if constexpr (is_constant<0, Offset>::value) { + return composition(right_inverse(layout.layout_b()), layout.layout_a()); + } else { + return composition(right_inverse(layout.layout_b()), right_inverse(layout.offset()), right_inverse(layout.layout_a())); + } +} + +// Specialization to attempt to pass-through the Swizzle back to the left -- Needed? 
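+// Note: a Swizzle is an involution (applying it twice is the identity), so it acts as
+// its own left and right inverse; see the Swizzle overloads of right_inverse/left_inverse below.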
+template +CUTE_HOST_DEVICE constexpr +auto +left_inverse(ComposedLayout,Offset,Layout> const& layout) +{ + if constexpr (is_constant<0, Offset>::value) { + return composition(left_inverse(layout.layout_b()), layout.layout_a()); + } else { + return composition(left_inverse(layout.layout_b()), left_inverse(layout.offset()), left_inverse(layout.layout_a())); + } +} + +template +CUTE_HOST_DEVICE constexpr +Swizzle +right_inverse(Swizzle const& sw) +{ + return sw; +} + +template +CUTE_HOST_DEVICE constexpr +Swizzle +left_inverse(Swizzle const& sw) +{ + return sw; +} + +// Kludge -- Probably want an OffsetFn here instead +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +right_inverse(T const& t) +{ + return -t; +} + +// Kludge -- Probably want an OffsetFn here instead +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +left_inverse(T const& t) +{ + return -t; +} + +// +// Upcast and Downcast +// + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(Swizzle const& swizzle) +{ + static_assert(has_single_bit(N), "N must be a power of two"); + constexpr int log2_n = bit_width(uint32_t(N)) - 1; + constexpr int NewM = M - log2_n; + if constexpr (NewM >= 0) { + return Swizzle{}; + } else { + return Swizzle{}; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(Swizzle const& swizzle) +{ + static_assert(has_single_bit(N), "N must be a power of two"); + constexpr int log2_n = bit_width(uint32_t(N)) - 1; + return Swizzle{}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast_layout(Swizzle const& swizzle) +{ + using scale = decltype(trait_ratio(sizeof_bits{}, sizeof_bits{})); + if constexpr (scale::num == 1 && scale::den == 1) { + return swizzle; + } + else if constexpr (scale::num == 1) { + return downcast(swizzle); + } + else if constexpr (scale::den == 1) { + return upcast(swizzle); + } + else { + return downcast(upcast(layout)); + } + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_alignment(Swizzle const&) +{ + return Int<(1 << M)>{}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_alignment(ComposedLayout,Offset,LayoutB> const& layout) +{ + return gcd(max_alignment(layout.layout_a()), + max_alignment(layout.offset()), + max_alignment(layout.layout_b())); +} + +// +// Other operations +// + +template +CUTE_HOST_DEVICE constexpr +auto +max_common_layout(ComposedLayout,Offset,LayoutB> const& a, + Layout const& b) +{ + auto common = max_common_layout(a.layout_b(), b); + auto base = Int<(1 << M)>{}; + if constexpr (base < size(common)) { + return common.compose(base); // Truncate common to size base + } else { + return common; + } +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_common_layout(Layout const& a, + ComposedLayout,Offset,LayoutB> const& b) +{ + return max_common_layout(b, a); +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_common_vector(ComposedLayout,Offset,LayoutB> const& a, + Layout const& b) +{ + // This assumes that Offset is in the YZ domain of the Swizzle... 
+ return cute::min(max_common_vector(a.layout_b(), b), Int<(1 << M)>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_common_vector(Layout const& a, + ComposedLayout,Offset,LayoutB> const& b) +{ + return max_common_vector(b, a); +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_common_vector(ComposedLayout,Offset0,LayoutB0> const& a, + ComposedLayout,Offset1,LayoutB1> const& b) +{ + // Typical impl is composition(a, right_inverse(b)) + // so this is Sw0 o B0 o rinv(Sw1 o B1) = Sw0 o B0 o rinv(B1) o Sw1 + auto vec = max_common_vector(a.layout_b(), b.layout_b()); + + // This assumes that Offset is in the YZ domain of the Swizzle... + if constexpr (Swizzle{} == Swizzle{}) { + return vec; + } else { + return cute::min(vec, Int<(1 << M0)>{}, Int<(1 << M1)>{}); + } + + CUTE_GCC_UNREACHABLE; +} + +/////////////////////////////////////////////////////////////////////////////// +// ComposedLayout as second argument is often more difficult... + +template +CUTE_HOST_DEVICE constexpr +auto +logical_product(Layout const& layout, + ComposedLayout,Offset,LayoutT> const& tiler) +{ + CUTE_STATIC_ASSERT_V(tiler.offset() == Int<0>{}, "Require Swizzle offset == 0."); + // The new layout -- if swizzle wasn't an issue, this is the result + // our goal is to determine a new swizzle for these strides + auto new_layout = logical_product(layout, tiler.layout_b()); + + // This is accomplished by identifying + // S o L :=: S? o L* + // We identify the "active" portion of S by computing (P o L)(c*) where P is a projection generated by S + // Then that active identifier is transformed through the layouts: + // L*(L[(P o L)(c*)]) + // which is a new swizzle identifier for S?, the new swizzle + + // Projections of the swizzle layout for composition, P + auto swizzle_only_zy = make_layout(make_shape (Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 << B )>{}, Int<1>{}), + make_stride( Int<0>{}, Int<(1 << M)>{}, Int<0>{}, Int<(1 << (M+abs(S)))>{}, Int<0>{})); + + // Compose with the tiler to get the swizzle projection, P o L [The Z and Y contributing portions of L] + auto layout_only_zy = composition(swizzle_only_zy, tiler.layout_b()); + // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*) + auto swizzle_active_bits = layout_only_zy(size(layout_only_zy)-Int<1>{}); + // Get the Z bit and the Y bits + auto active_Z = swizzle_active_bits & typename Swizzle::zzz_msk{}; + auto active_Y = swizzle_active_bits & typename Swizzle::yyy_msk{}; + + // Pass the identifiers through the old layout and new layout to make a new swizzle identifier, L*(L[(P o L)(c*)]) + auto new_active_Z = new_layout(Int<0>{}, tiler.layout_b()[active_Z]); + auto new_active_Y = new_layout(Int<0>{}, tiler.layout_b()[active_Y]); + + // Use this new swizzle identifier to construxt the new swizzle for new_layout + // (this also makes sure it's a "valid" swizzle that Swizzle can represent) + return composition(make_swizzle(), new_layout); +} + +} // end namespace cute diff --git a/include/cute/tensor.hpp b/include/cute/tensor.hpp new file mode 100644 index 0000000000..3f3335b63d --- /dev/null +++ b/include/cute/tensor.hpp @@ -0,0 +1,58 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +// +// Extended Engines +// + +#include +#include +#include +#include + +// +// Tensor Algorithms +// + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + diff --git a/include/cute/tensor_impl.hpp b/include/cute/tensor_impl.hpp new file mode 100644 index 0000000000..2be19c15e3 --- /dev/null +++ b/include/cute/tensor_impl.hpp @@ -0,0 +1,1206 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This file contains the definition of Tensor as well as classes/functions most closely associated with it. + + For backwards-compatibility, "tensor.hpp" is the "entrypoint" header for a collection of classes and utilities + that are adjacent to Tensor, e.g. fill(). Whereas this file contains the actual definition of Tensor and + a small set of functions central to its usage. + + Within the CUTLASS codebase, favor not including "tensor.hpp" wherever possible; instead include "tensor_impl.hpp" + along with other specific headers that you need. This helps to avoid circular includes and to reduce build time. +*/ + +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::Shape +#include // cute::is_composed_layout +#include // cute::recast_ptr +#include // cute::iterator_traits +#include // cute::array_aligned +#include // cute::array_subbyte +#include // cute::tuple +#include // cute::is_integral +#include // __CUTE_REQUIRES + +namespace cute +{ + +// +// Engine -- owning or non-owning data store +// + +// concept Engine { +// using iterator = ; +// using value_type = ; +// using element_type = ; +// using reference = ; +// iterator begin(); +// }; + +template +struct ArrayEngine +{ + using Storage = typename conditional<(sizeof_bits::value % 8 == 0), + array_aligned, + array_subbyte>::type; + using iterator = typename Storage::iterator; + using reference = typename iterator_traits::reference; + using element_type = typename iterator_traits::element_type; + using value_type = typename iterator_traits::value_type; + Storage storage_; + + CUTE_HOST_DEVICE constexpr auto begin() const { return storage_.begin(); } + CUTE_HOST_DEVICE constexpr auto begin() { return storage_.begin(); } +}; + +// Specialization for sparse_elem tensor allocation/iteration +// NOTE: This can and should be used for allocation of SMEM as well! +// Fuse these two ArrayEngines? 
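+// The specialization below exposes its storage through a sparse_ptr iterator
+// (hence the recast in begin()), so dereferencing yields sparse_elem values
+// rather than the raw underlying values.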
+template +struct ArrayEngine, N> +{ + static_assert(N % S == 0, "Expected a multiple of the sparsity."); + using value_type = sparse_elem; + using Storage = typename conditional<(sizeof_bits::value % 8 == 0), + array_aligned, + array_subbyte>::type; + using iterator = sparse_ptr*>; + using reference = typename iterator_traits::reference; + using element_type = typename iterator_traits::element_type; + Storage storage_; + + CUTE_HOST_DEVICE constexpr auto begin() const { return recast_ptr(storage_.begin()); } + CUTE_HOST_DEVICE constexpr auto begin() { return recast_ptr(storage_.begin()); } +}; + +template +struct ViewEngine +{ + using iterator = Iterator; + using reference = typename iterator_traits::reference; + using element_type = typename iterator_traits::element_type; + using value_type = typename iterator_traits::value_type; + iterator storage_; + + CUTE_HOST_DEVICE constexpr iterator const& begin() const { return storage_; } + CUTE_HOST_DEVICE constexpr iterator & begin() { return storage_; } +}; + +template +struct ConstViewEngine +{ + using iterator = Iterator; + using reference = typename iterator_traits::reference; + using element_type = typename iterator_traits::element_type; + using value_type = typename iterator_traits::value_type; + iterator storage_; + + CUTE_HOST_DEVICE constexpr iterator const& begin() const { return storage_; } +}; + +// +// Tensor +// + +template +struct Tensor +{ + using iterator = typename Engine::iterator; + using value_type = typename Engine::value_type; + using element_type = typename Engine::element_type; + using reference = typename Engine::reference; + + using engine_type = Engine; + using layout_type = Layout; + + CUTE_HOST_DEVICE constexpr + Tensor() {} + + CUTE_HOST_DEVICE constexpr + Tensor(Engine const& engine, Layout const& layout) + : rep_(layout, engine) { + } + + // + // Accessors + // + + static constexpr int rank = Layout::rank; + + CUTE_HOST_DEVICE constexpr + decltype(auto) + tensor() const { + return *this; + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + engine() const { + return get<1>(rep_); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + engine() { + return get<1>(rep_); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + data() const { + return engine().begin(); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + data() { + return engine().begin(); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + layout() const { + return get<0>(rep_); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + shape() const { + return layout().shape(); + } + + CUTE_HOST_DEVICE constexpr + auto + size() const { + return cute::size(shape()); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + stride() const { + return layout().stride(); + } + + // + // Indexing op() and op[] + // + + // Index into this tensor like an array by computing the offset via layout() + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator[](Coord const& coord) { + return data()[layout()(coord)]; + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator[](Coord const& coord) const { + return data()[layout()(coord)]; + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator()(Coord const& coord) { + if constexpr (has_underscore::value) { + auto const& [sliced_layout,offset] = slice_and_offset(coord, layout()); + return make_tensor(data() + offset, sliced_layout); + } else { + return data()[layout()(coord)]; + } + + CUTE_GCC_UNREACHABLE; + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + 
operator()(Coord const& coord) const { + if constexpr (has_underscore::value) { + auto const& [sliced_layout,offset] = slice_and_offset(coord, layout()); + return make_tensor(data() + offset, sliced_layout); + } else { + return data()[layout()(coord)]; + } + + CUTE_GCC_UNREACHABLE; + } + + // op() convenience function for multi-dimensional coordinates + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) { + return operator()(make_coord(c0,c1,cs...)); + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const { + return operator()(make_coord(c0,c1,cs...)); + } + + // + // Compose + // + + template + CUTE_HOST_DEVICE constexpr + auto + compose(Layouts const&... layouts) { + return make_tensor(data(), layout().compose(layouts...)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + compose(Layouts const&... layouts) const { + return make_tensor(data(), layout().compose(layouts...)); + } + + // + // Tile + // + + template + CUTE_HOST_DEVICE constexpr + auto + tile(Layouts const&... layouts) { + return make_tensor(data(), layout().tile(layouts...)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + tile(Layouts const&... layouts) const { + return make_tensor(data(), layout().tile(layouts...)); + } + + // + // Utility + // + + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_1d_coord(Int const& linear_idx) const { + return layout().get_1d_coord(linear_idx); + } + + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_hier_coord(Int const& linear_idx) const { + return layout().get_hier_coord(linear_idx); + } + + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_flat_coord(Int const& linear_idx) const { + return layout().get_flat_coord(linear_idx); + } + + cute::tuple rep_; +}; + +template +struct is_tensor : false_type {}; +template +struct is_tensor> : true_type {}; +template +constexpr bool is_tensor_v = is_tensor::value; + +// Customization point for creation of owning and non-owning Tensors +template +struct MakeTensor +{ + template + CUTE_HOST_DEVICE constexpr auto + operator()(Arg0 const& arg0, Args const&... args) const + { + if constexpr (has_dereference::value) { + // Construct a non-owning Tensor + using Engine = ViewEngine; + if constexpr (sizeof...(Args) == 1 && (is_layout::value && ...)) { + // Forward a Layout + return Tensor{Engine{arg0}, args...}; + } else { + // Construct a Layout from Args + return Tensor{Engine{arg0}, make_layout(args...)}; + } + } else { + // Construct an owning Tensor + static_assert((is_static::value && ... && is_static::value), + "Dynamic owning tensors not supported"); + if constexpr (sizeof...(Args) == 0 && is_layout::value) { + // Forward a Layout + using Layout = Arg0; + using Engine = ArrayEngine>; + return Tensor(); + } else { + // Construct a Layout from Args + using Layout = decltype(make_layout(arg0, args...)); + using Engine = ArrayEngine>; + return Tensor(); + } + } + } +}; + +// +// make_tensor +// + +// Make an owning Tensor that will allocate a static array +// e.g. make_tensor(Int<12>{}) +template +CUTE_HOST_DEVICE constexpr +auto +make_tensor(Args const&... args) +{ + static_assert((not has_dereference::value && ...), "Expected layout args... in make_tensor(args...)"); + return MakeTensor{}(args...); +} + +// Make a non-owning Tensor that will use a pointer (view) +// e.g. 
make_tensor(vec.data(), 12) +template +CUTE_HOST_DEVICE constexpr +auto +make_tensor(Iterator const& iter, Args const&... args) +{ + static_assert(has_dereference::value, "Expected iterator iter in make_tensor(iter, args...)"); + static_assert((not has_dereference::value && ...), "Expected layout args... in make_tensor(iter, args...)"); + return MakeTensor{}(iter, args...); +} + +// +// make_tensor_like +// Make a register tensor the same type and shape and (if possible) order as another tensor +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_tensor_like(Layout const& layout) +{ + return make_tensor(make_layout_like(layout)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_tensor_like(Tensor const& tensor) +{ + return make_tensor_like(tensor.layout()); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_tensor_like(Tensor const& tensor) +{ + return make_tensor_like(tensor.layout()); +} + +// +// make_fragment_like +// Make a tensor the same shape and (if possible) order as another tensor, with special +// consideration of the 0th mode. The 0th mode is commonly used for MMA_Atoms or Copy_Atoms +// so this allocates the 0th mode with LayoutLeft regardless of the reference layout. +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_fragment_like(Layout const& layout) +{ + return make_tensor(make_fragment_like(layout)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_fragment_like(Tensor const& tensor) +{ + return make_fragment_like(tensor.layout()); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_fragment_like(Tensor const& tensor) +{ + return make_fragment_like(tensor.layout()); +} + +// +// make_counting_tensor +// Make a tensor from a layout by binding it to a counting iter with 0-offset of the same profile as the codomain. +// + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +make_counting_tensor(Layout const& layout) +{ + return make_tensor(make_inttuple_iter(repeat_like(coshape(layout), Int<0>{})), layout); +} + +// +// make_identity_tensor +// Make a tensor that maps coordinates within a shape to themselves. 
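+// For example (illustrative), make_identity_tensor(make_shape(4,8)) is a (4,8) tensor t
+// with t(i,j) == (i,j); such coordinate tensors are commonly compared against problem
+// bounds to build predicates for partial tiles.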
+// + +template +CUTE_HOST_DEVICE constexpr +auto +make_identity_tensor(Shape const& shape) +{ + return make_counting_tensor(make_identity_layout(shape)); +} + +// +// Utilities +// + +// Return the subtensor of a mode +template +CUTE_HOST_DEVICE constexpr +auto +tensor(Tensor&& tensor) +{ + if constexpr (sizeof...(Is) == 0) { + return tensor; + } else { + return make_tensor(tensor.data(), get(tensor.layout())); + } + + CUTE_GCC_UNREACHABLE; +} + +// Return the layout of a mode +template +CUTE_HOST_DEVICE constexpr +auto +layout(Tensor const& tensor) +{ + return layout(tensor.layout()); +} + +// Return the shape of a mode +template +CUTE_HOST_DEVICE constexpr +auto +shape(Tensor const& tensor) +{ + return shape(tensor.layout()); +} + +// Return the stride of a mode +template +CUTE_HOST_DEVICE constexpr +auto +stride(Tensor const& tensor) +{ + return stride(tensor.layout()); +} + +// Return the number of elements in a mode +template +CUTE_HOST_DEVICE constexpr +auto +size(Tensor const& tensor) +{ + return size(tensor.layout()); +} + +// Return the rank of a mode +template +CUTE_HOST_DEVICE constexpr +auto +rank(Tensor const& tensor) +{ + return rank(tensor.layout()); +} + +// Return the depth of a mode +template +CUTE_HOST_DEVICE constexpr +auto +depth(Tensor const& tensor) +{ + return depth(tensor.layout()); +} + +// +// Operations to manipulate Tensors like a Layout or IntTuple +// These are implemented with explicit modifier overloads because these +// methods likely also have a general IntTuple overload that can shadow. +// + +template +CUTE_HOST_DEVICE constexpr +auto +flatten(Tensor const& tensor) { + return make_tensor(tensor.data(), flatten(tensor.layout())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +flatten(Tensor& tensor) { + return make_tensor(tensor.data(), flatten(tensor.layout())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +flatten(Tensor&& tensor) { + return make_tensor(tensor.data(), flatten(tensor.layout())); +} + +template > +CUTE_HOST_DEVICE constexpr +auto +coalesce(Tensor const& tensor, Profile const& profile = {}) { + return make_tensor(tensor.data(), coalesce(tensor.layout(), profile)); +} + +template > +CUTE_HOST_DEVICE constexpr +auto +coalesce(Tensor& tensor, Profile const& profile = {}) { + return make_tensor(tensor.data(), coalesce(tensor.layout(), profile)); +} + +template > +CUTE_HOST_DEVICE constexpr +auto +coalesce(Tensor&& tensor, Profile const& profile = {}) { + return make_tensor(tensor.data(), coalesce(tensor.layout(), profile)); +} + +// Replace the modes in layout that have a 0-stride with a 1-size +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(Tensor const& tensor) { + return make_tensor(tensor.data(), filter_zeros(tensor.layout())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(Tensor& tensor) { + return make_tensor(tensor.data(), filter_zeros(tensor.layout())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(Tensor&& tensor) { + return make_tensor(tensor.data(), filter_zeros(tensor.layout())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(Tensor const& tensor, Profile const& profile) +{ + return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(Tensor& tensor, Profile const& profile) +{ + return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(Tensor&& tensor, Profile const& profile) +{ + return 
make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile)); +} + +// Remove all of the 0-strides and 1-sizes +template +CUTE_HOST_DEVICE constexpr +auto +filter(Tensor const& tensor) { + return make_tensor(tensor.data(), filter(tensor.layout())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter(Tensor& tensor) { + return make_tensor(tensor.data(), filter(tensor.layout())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter(Tensor&& tensor) { + return make_tensor(tensor.data(), filter(tensor.layout())); +} + +// Group the modes [B,E) into a single mode +// e.g. group<2,4>(make_tensor(Layout>{})) +// => make_tensor(Layout,_5,_6>>{}) +template +CUTE_HOST_DEVICE constexpr +auto +group_modes(Tensor const& tensor) { + return make_tensor(tensor.data(), group(tensor.layout())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +group_modes(Tensor& tensor) { + return make_tensor(tensor.data(), group(tensor.layout())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +group_modes(Tensor&& tensor) { + return make_tensor(tensor.data(), group(tensor.layout())); +} + +// Return the subtensor of a range of modes +template +CUTE_HOST_DEVICE constexpr +auto +take(Tensor const& tensor) { + return make_tensor(tensor.data(), take(tensor.layout())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +take(Tensor& tensor) { + return make_tensor(tensor.data(), take(tensor.layout())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +take(Tensor&& tensor) { + return make_tensor(tensor.data(), take(tensor.layout())); +} + +// Return a tensor with the same shape as input but offset by a given coordinate +template >::value)> +CUTE_HOST_DEVICE constexpr +auto +domain_offset(Coord const& coord, Tensor&& tensor) +{ + auto [layout, ptr_offset] = domain_offset(coord, tensor.layout()); + return make_tensor(static_cast(tensor).data() + ptr_offset, layout); +} + +// +// Recast +// + +// NOTE: This is very dangerous to do +// -- doesn't check dynamic integer divisibility +// -- doesn't check alignment + +template +CUTE_HOST_DEVICE constexpr +auto +recast(Tensor&& tensor) +{ + using OldType = typename remove_cvref_t::value_type; + auto old_layout = tensor.layout(); + auto new_layout = recast_layout(old_layout); + + // If this is an upcast of a normal Layout with static negative strides, then offset as well + if constexpr (sizeof(OldType) < sizeof(NewType) && not is_composed_layout::value) { + auto shape_diff = transform(flatten(old_layout.shape()), flatten(new_layout.shape()), minus{}); + auto extent_diff = transform(shape_diff, flatten(old_layout.stride()), multiplies{}); + auto offset = fold(extent_diff, Int<0>{}, [](auto const& i, auto const& a) { return i + cute::min(a,Int<0>{}); }); + + return make_tensor(recast_ptr(static_cast(tensor).data() + offset), new_layout); + } else { + return make_tensor(recast_ptr(static_cast(tensor).data() ), new_layout); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// max_common_vector +// + +/* Return Int such that N is the maximum number of contiguous elements + * that logically correspond in the tensors of @a a and @a b. This is, + * the number of elements that could reasonably be vectorized into a single load/store. + * + * @returns Int with N >= 0 + * + * A return value of Int<0> indicates that no such conclusion can be made and no + * vectorization should be attempted. + * + * Note that the return value does NOT include alignment concerns such as the pointer value and + * the divisbility of dynamic strides. 
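+ *
+ * For example (illustrative), if the two value types differ (say float vs. half_t),
+ * the copy is also a conversion, so Int<0> is returned and no vectorization is attempted.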
+ */ +template +CUTE_HOST_DEVICE constexpr +auto +max_common_vector(Tensor const& a, + Tensor const& b) +{ + using SrcType = typename SrcEngine::value_type; + using SrcRef = typename SrcEngine::reference; + using DstType = typename DstEngine::value_type; + using DstRef = typename DstEngine::reference; + + // Determine if vectorization candidates at all + if constexpr (// Should be the same value_types, else the copy is also performing a cast + cute::is_same::value && + // The types should be trivially copyable so that vectorization is valid + is_trivially_copyable::value && + is_trivially_copyable::value && + // Should be load/storing real data, rather than implicit iterators or such + is_reference::value && + is_reference::value) + { + return max_common_vector(a.layout(), b.layout()); + } else { + return Int<0>{}; + } + + CUTE_GCC_UNREACHABLE; +} + +/* Return a layout that points to the maximum number of contiguous elements + * that logically correspond in the tensors of @a a and @a b. This is, + * the elements that could reasonably be "vectorized" into a single load/store. + * + * @returns Layout R such that composition(a.layout(), R) and composition(b.layout(), R) + * are both identity Layouts. + * + * Note that the returned layout does NOT include alignment concerns such as the pointer value and + * the divisbility of dynamic strides. + */ +template +CUTE_HOST_DEVICE constexpr +auto +max_common_layout(Tensor const& a, + Tensor const& b) +{ + using SrcType = typename SrcEngine::value_type; + using SrcRef = typename SrcEngine::reference; + using DstType = typename DstEngine::value_type; + using DstRef = typename DstEngine::reference; + + // Determine if vectorization candidates at all + if constexpr (// Should be the same value_types, else the copy is also performing a cast + cute::is_same::value && + // The types should be trivially copyable so that vectorization is valid + is_trivially_copyable::value && + is_trivially_copyable::value && + // Should be load/storing real data, rather than implicit iterators or such + is_reference::value && + is_reference::value) + { + return max_common_layout(a.layout(), b.layout()); + } else { + return Layout<_1,_0>{}; + } + + CUTE_GCC_UNREACHABLE; +} + +/* Return the maximum (statically known) alignment of a Tensor in the number of bits + */ +template +CUTE_HOST_DEVICE constexpr +auto +max_alignment(Tensor const& t) +{ + return gcd(max_alignment(t.data()), + max_alignment(t.layout()) * static_value>()); +} + +// +// Key algebraic operations -- Composition, Divide, and Product +// + +// Apply a Tiler to the Tensor via composition. +template >::value)> +CUTE_HOST_DEVICE constexpr +auto +composition(Tensor && tensor, + Tiler const& tiler) // Layout or Tile or Shape +{ + return make_tensor(static_cast(tensor).data(), + composition(tensor.layout(), tiler)); +} + +// Apply a Tiler to the Tensor. +// +// Consider a Tensor with shape (A,B,x,y) +// And a Tiler that is: +// +// * A Layout with shape (BLK_A,BLK_B) +// ** Result Tensor shape ((BLK_A,BLK_B),Rest). +// ** That is, the Tensor and Tile are treated as 1D for the tiling. +// ** See logical_divide(Layout,Layout) +// +// * A Tile with shape +// ** Result Tensor shape ((BLK_A,a),(BLK_B,b),x,y). +// ** Each mode of the Tile is applied to the corresponding mode of the Tensor. +// ** See logical_divide(Layout,Tuple) +// +// * A Shape (BLK_A,BLK_B) +// ** Result Tensor shape ((BLK_A,a),(BLK_B,b),x,y). +// ** Equivalent to applying Tile. 
+// ** See logical_divide(Layout,Tuple) and logical_divide(Layout,Int) +// +// Note that the Tile/Shape Tilers must be weakly_congruent to the Tensor +template >::value)> +CUTE_HOST_DEVICE constexpr +auto +logical_divide(Tensor && tensor, + Tiler const& tiler) // Layout or Tile or Shape +{ + return make_tensor(static_cast(tensor).data(), + logical_divide(tensor.layout(), tiler)); +} + +// zipped_divide is logical_divide with Tiler modes and Rest modes gathered together: (Tiler,Rest) +// When Tiler is Layout, this has no effect as logical_divide results in the same. +// When Tiler is Tile or Shape, this zips modes into standard form ((BLK_A,BLK_B),(a,b,x,y)) +template >::value)> +CUTE_HOST_DEVICE constexpr +auto +zipped_divide(Tensor && tensor, + Tiler const& tiler) // Layout or Tile or Shape +{ + return make_tensor(static_cast(tensor).data(), + zipped_divide(tensor.layout(), tiler)); +} + +// tiled_divide is zipped_divide with the second output mode flattened ((BLK_A,BLK_B),a,b,x,y) +template >::value)> +CUTE_HOST_DEVICE constexpr +auto +tiled_divide(Tensor && tensor, + Tiler const& tiler) // Layout or Tile or Shape +{ + return make_tensor(static_cast(tensor).data(), + tiled_divide(tensor.layout(), tiler)); +} + +// flat_divide is zipped_divide with the both modes flattened (BLK_A,BLK_B,a,b,x,y) +template >::value)> +CUTE_HOST_DEVICE constexpr +auto +flat_divide(Tensor && tensor, + Tiler const& tiler) // Layout or Tile or Shape +{ + return make_tensor(static_cast(tensor).data(), + flat_divide(tensor.layout(), tiler)); +} + +// logical_product on a Tensor doesn't make sense since it often increases cosize +// though this might make sense for creating Tensors with broadcasted (stride-0) modes + +// +// Tensor partitioning utilities +// + +// Apply a Tiler to the Tensor, then slice out one of those tiles by slicing into the "Rest" modes. +// With an inner_partition, you get everything that's inside the Tiler. Everything that the Tiler is pointing to. +// Split the modes of tensor according to the Tiler +// zipped_divide returns something like ((BLK_A,BLK_B,...),(a,b,...,x,y)) +// Then slice into the second mode (the "Rest" mode) with Coord +template >::value)> +CUTE_HOST_DEVICE constexpr +auto +inner_partition(Tensor && tensor, + Tiler const& tiler, + Coord const& coord) +{ + auto tensor_tiled = zipped_divide(static_cast(tensor), tiler); + constexpr int R0 = decltype(rank<0>(tensor_tiled))::value; + + // The coord slices into the second mode (the "rest" mode), flatten the first + if constexpr (is_tuple::value) { + // Append trailing modes if coord is tuple + constexpr int R1 = decltype(rank<1>(tensor_tiled))::value; + return tensor_tiled(repeat(_), append(coord,_)); + } else { + // Flat indexing if coord is not tuple + return tensor_tiled(repeat(_), coord); + } +} + +// Apply a Tiler to the Tensor, then slice out the remainder by slicing into the "Tile" modes. +// With an outer_partition, you get everything that's outside the Tiler. The layout of the Tile in the Tensor. 
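+// For example (illustrative):
+//   Tensor data = ...                                                        // ( M, N)
+//   Tensor rest = outer_partition(data, Shape<_32,_64>{}, make_coord(0,0));  // (M/32,N/64)
+// selects the (0,0) element of every 32x64 tile, i.e. the arrangement of the tiles themselves.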
+// Split the modes of tensor according to the Tiler +// zipped_divide returns something like ((BLK_A,BLK_B,...),(a,b,...,x,y)) +// Then slice into the first mode (the "Tile" mode) with Coord +template >::value)> +CUTE_HOST_DEVICE constexpr +auto +outer_partition(Tensor && tensor, + Tiler const& tiler, + Coord const& coord) +{ + auto tensor_tiled = zipped_divide(static_cast(tensor), tiler); + constexpr int R1 = decltype(rank<1>(tensor_tiled))::value; + + // The coord slices into the first mode (the "tile" mode), flatten the second + if constexpr (is_tuple::value) { + // Append trailing modes if coord is tuple + constexpr int R0 = decltype(rank<0>(tensor_tiled))::value; + return tensor_tiled(append(coord,_), repeat(_)); + } else { + // Flat indexing if coord is not tuple + return tensor_tiled(coord, repeat(_)); + } +} + +// Tile a tensor according to @a tiler and use @a coord to index into the remainder, keeping the tile. +// This is typical at the CTA level where tiles of data are extracted: +// Tensor data = ... // ( M, N) +// Tensor cta_data = local_tile(data, Shape<_32,_64>{}, make_coord(blockIdx.x,blockIdx.y)); // (_32,_64) +template >::value)> +CUTE_HOST_DEVICE constexpr +auto +local_tile(Tensor && tensor, + Tiler const& tiler, // tiler to apply + Coord const& coord) // coord to slice into "remainder" +{ + return inner_partition(static_cast(tensor), + tiler, + coord); +} + +// Same as above, but with a projection parameter to strip out unwanted tiling modes for convenience +// when using projections of the same tiler. +// This is typical at the CTA level where tiles of data are extracted as projections: +// Tensor dataA = ... // (M,K) +// Tensor dataB = ... // (N,K) +// Tensor dataC = ... // (M,N) +// auto cta_tiler = Shape<_32, _64, _4>{}; +// auto cta_coord = make_coord(blockIdx.x, blockIdx.y, _); +// Tensor ctaA = local_tile(dataA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (_32,_4,k) +// Tensor ctaB = local_tile(dataB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (_64,_4,k) +// Tensor ctaC = local_tile(dataC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (_32,_64) +template >::value)> +CUTE_HOST_DEVICE +auto +local_tile(Tensor && tensor, + Tiler const& tiler, // tiler to apply + Coord const& coord, // coord to slice into "remainder" + Proj const& proj) // projection to apply to tiler and coord +{ + return local_tile(static_cast(tensor), + dice(proj, tiler), + dice(proj, coord)); +} + +// Tile a tensor according to the flat shape of a layout that provides the coordinate of the target index. +// This is typical at the Thread level where data is partitioned across repeated patterns of threads: +// Tensor data = ... // (_16,_64) +// Tensor thr_data = local_partition(data, Layout>{}, thr_idx); // ( _8, _4) +template >::value)> +CUTE_HOST_DEVICE +auto +local_partition(Tensor && tensor, + Layout const& tile, // coord -> index + Index const& index) // index to slice for +{ + static_assert(is_integral::value); + return outer_partition(static_cast(tensor), + product_each(shape(tile)), + tile.get_flat_coord(index)); +} + +// Same as above, but with a projection parameter to strip out unwanted tiling modes for convenience +// when using projections of the same tiler. +// This is typical at the Thread level where data is partitioned across projected layouts of threads: +// Tensor dataA = ... // (M,K) +// Tensor dataB = ... // (N,K) +// Tensor dataC = ... 
// (M,N) +// auto thr_layout = Layout, Stride<_16,_1,_0>>{}; +// Tensor thrA = local_partition(dataA, thr_layout, thr_idx, Step<_1, X,_1>{}); // (M/2,K/1) +// Tensor thrB = local_partition(dataB, thr_layout, thr_idx, Step< X,_1,_1>{}); // (N/16,K/1) +// Tensor thrC = local_partition(dataC, thr_layout, thr_idx, Step<_1,_1, X>{}); // (M/2,N/16) +template >::value)> +CUTE_HOST_DEVICE +auto +local_partition(Tensor && tensor, + Layout const& tile, // coord -> index + Index const& index, // index to slice for + Projection const& proj) +{ + return local_partition(static_cast(tensor), + dice(proj, tile), + index); +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(Tensor const& tensor) +{ + print(tensor.data()); print(" o "); print(tensor.layout()); +} + +template +CUTE_HOST_DEVICE void print_tensor(Tensor const& tensor, bool print_type = true) +{ + if (print_type) { + print(tensor); print(":\n"); + } + + if constexpr (Layout::rank == 1) + { + for (int m = 0; m < size(tensor); ++m) { + pretty_print(tensor(m)); + printf("\n"); + } + } else + if constexpr (Layout::rank == 2) + { + for (int m = 0; m < size<0>(tensor); ++m) { + for (int n = 0; n < size<1>(tensor); ++n) { + pretty_print(tensor(m,n)); + } + printf("\n"); + } + } else + if constexpr (Layout::rank == 3) + { + print_tensor(tensor(_,_,0), false); + for (int k = 1; k < size<2>(tensor); ++k) { + for (int i = 0; i < 5*size<1>(tensor); ++i) { print("-"); } print("\n"); + print_tensor(tensor(_,_,k), false); + } + } else + if constexpr (Layout::rank == 4) + { + print_tensor(tensor(_,_,_,0), false); + for (int p = 1; p < size<3>(tensor); ++p) { + for (int i = 0; i < 5*size<1>(tensor); ++i) { print("="); } print("\n"); + print_tensor(tensor(_,_,_,p), false); + } + } +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& print_tensor_os(std::ostream& os, Tensor const& tensor) +{ + int digits = 9; + + if constexpr (Layout::rank == 1) + { + for (int m = 0; m < size(tensor); ++m) { + os << std::setw(digits) << tensor(m) << std::endl; + } + } else + if constexpr (Layout::rank == 2) + { + for (int m = 0; m < size<0>(tensor); ++m) { + for (int n = 0; n < size<1>(tensor); ++n) { + os << std::setw(digits) << tensor(m,n); + } + os << std::endl; + } + } else + if constexpr (Layout::rank == 3) + { + print_tensor_os(os, tensor(_,_,0)); + for (int k = 1; k < size<2>(tensor); ++k) { + for (int i = 0; i < digits*size<1>(tensor); ++i) { os << "-"; } os << std::endl; + print_tensor_os(os, tensor(_,_,k)); + } + } else + if constexpr (Layout::rank == 4) + { + print_tensor_os(os, tensor(_,_,_,0)); + for (int p = 1; p < size<3>(tensor); ++p) { + for (int i = 0; i < digits*size<1>(tensor); ++i) { os << "="; } os << std::endl; + print_tensor_os(os, tensor(_,_,_,p)); + } + } + + return os; +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, Tensor const& tensor) +{ + os << tensor.layout() << std::endl; + return print_tensor_os(os, tensor); +} +#endif // !defined(__CUDACC_RTC__) + +} // end namespace cute + diff --git a/include/cute/tensor_predicate.hpp b/include/cute/tensor_predicate.hpp new file mode 100644 index 0000000000..9c8a2ba614 --- /dev/null +++ b/include/cute/tensor_predicate.hpp @@ -0,0 +1,78 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
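As a small, hypothetical end-to-end sketch of the partitioning utilities above: a CTA-level local_tile followed by a thread-level local_partition, dumped with print_tensor. The buffer, the 32-thread layout, and the tile sizes are illustrative choices, not values taken from the patch.

#include <cute/tensor.hpp>
using namespace cute;

void partition_demo(int thr_idx)   // thr_idx in [0, 32)
{
  float buf[64 * 128];
  for (int i = 0; i < 64 * 128; ++i) { buf[i] = float(i); }
  Tensor data = make_tensor(&buf[0], make_shape(Int<64>{}, Int<128>{}));     // (_64,_128)

  // CTA level: keep the 32x64 tile at tile-coordinate (1,0)
  Tensor cta_data = local_tile(data, Shape<_32,_64>{}, make_coord(1, 0));    // (_32,_64)

  // Thread level: a 2x16 arrangement of 32 threads; each thread owns a (_16,_4) slice
  auto thr_layout = Layout<Shape<_2,_16>, Stride<_1,_2>>{};
  Tensor thr_data = local_partition(cta_data, thr_layout, thr_idx);          // (_16,_4)

  print_tensor(thr_data);   // dumps a 16x4 grid of this thread's values
}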
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::true_type + +namespace cute +{ + +template +struct ConstantTensor +{ + template + CUTE_HOST_DEVICE constexpr + T const& + operator()(Coords const&...) const { + return val_; + } + + T val_; +}; + +struct TrivialPredTensor +{ + template + CUTE_HOST_DEVICE constexpr + true_type + operator()(Coords const&...) const { + return {}; + } +}; + +template +struct FunctionPredTensor +{ + CUTE_HOST_DEVICE constexpr + FunctionPredTensor(Fn const& fn) : fn_(fn) {} + + template + CUTE_HOST_DEVICE constexpr + auto + operator()(Coords const&... coords) const { + return fn_(coords...); + } + + Fn const& fn_; +}; + +} // end namespace cute diff --git a/include/cute/tensor_zip.hpp b/include/cute/tensor_zip.hpp new file mode 100644 index 0000000000..6d70ffc847 --- /dev/null +++ b/include/cute/tensor_zip.hpp @@ -0,0 +1,243 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
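For illustration, a few hedged examples of the predicate-tensor helpers introduced above; these objects simply map any coordinate to a value, which is the shape of predicate that masking algorithms such as cute::copy_if expect. The function and lambda names are made up.

#include <cute/tensor_predicate.hpp>

void predicate_demo()
{
  // A tensor-like object that returns the same value at every coordinate.
  cute::ConstantTensor<float> ones{1.0f};
  float v = ones(3, 7);                        // 1.0f

  // Always-true predicate; the natural "no masking" default.
  cute::TrivialPredTensor no_mask{};
  bool p0 = no_mask(123);                      // true

  // Wrap an arbitrary callable as a predicate over coordinates.
  // NOTE: FunctionPredTensor stores a reference, so the callable must outlive it.
  auto in_bounds = [](int m, int n) { return m < 60 && n < 120; };
  cute::FunctionPredTensor pred(in_bounds);
  bool p1 = pred(59, 119);                     // true

  (void)v; (void)p0; (void)p1;
}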
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::Tensor +#include // cute::tuple + +namespace cute +{ + +// A tuple of Iterators that can be offset asymmetrically +// Note that this only accepts op+(tuple) and op[tuple] +// where each iterator will be offset by its respective index only. +// READ-ONLY for now until cute::tuple can be constructed with references. +template +struct ZipIterator +{ + using value_type = cute::tuple...>; + using element_type = cute::tuple...>; + // NOTE: cute::tuple does not support constructions with references at the moment. + // Consider fixes and/or an implementation of std::forward_as_tuple. + // For now, use a cute::tuple of value_types instead, which makes this Iterator READ-ONLY. + //using reference = cute::tuple...>; + using reference = value_type; + + ZipIterator() = delete; + + CUTE_HOST_DEVICE constexpr + ZipIterator(Iters... iters) + : iters_(iters...) + {} + + CUTE_HOST_DEVICE constexpr + ZipIterator(cute::tuple const& iters) + : iters_(iters) + {} + + CUTE_HOST_DEVICE constexpr + reference operator*() const { + return cute::apply(iters_, [](auto&&... args) { return reference(*args...); }); + } + + template + CUTE_HOST_DEVICE constexpr + ZipIterator operator+(cute::tuple const& idxs) const { + static_assert(sizeof...(Index) == sizeof...(Iters), "Expect same number of offsets as iterators."); + return cute::transform(iters_, idxs, [](auto&& iter, auto&& idx) { return iter + idx; }); + } + + template + CUTE_HOST_DEVICE constexpr + reference operator[](cute::tuple const& idxs) const { + return *(*this + idxs); + } + + cute::tuple iters_; +}; + +//------------------------------------------------------------------------------ +// type traits + +template +struct is_rmem> : conjunction...> {}; +template +struct is_smem> : conjunction...> {}; +template +struct is_gmem> : conjunction...> {}; +// A tuple of Layouts that operates on each Layout symmetrically +// The Layouts need to have compatible shapes and ranks. +// The ZipLayout presents the intersection of the domain of its component Layouts. +// E.g. all Layouts accept 1D coords and ZipLayout does as well. +// The ZipLayout returns the union of the codomain of its component Layouts. +// E.g. all Layouts return an integer so ZipLayout returns a tuple of integers. +template +struct ZipLayout +{ + static constexpr int rank = (int(0) | ... | Layouts::rank); + + static_assert((is_layout::value && ...), "All template parameters must be layouts"); + static_assert(((Layouts::rank == rank) && ...), "All layouts must have the same rank"); + + CUTE_HOST_DEVICE constexpr + ZipLayout(Layouts const&... 
layouts) + : layouts_(layouts...) + {} + + CUTE_HOST_DEVICE constexpr + ZipLayout(cute::tuple const& layouts) + : layouts_(layouts) + {} + + template + CUTE_HOST_DEVICE constexpr + auto + operator()(Coord const& coord) const { + if constexpr (has_underscore::value) { + return ZipLayout(cute::transform(layouts_, [&] (auto layout) { return layout(coord); })); + } else { + return cute::transform(layouts_, [&] (auto layout) { return layout(coord); }); + } + + CUTE_GCC_UNREACHABLE; + } + + // op() convenience function for multi-dimensional coordinates + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const { + return operator()(make_coord(c0,c1,cs...)); + } + + cute::tuple layouts_; +}; + +template +struct is_layout> : true_type {}; + +// +// make_zip_tensor and unzip_tensor +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_zip_tensor(Tensor const&... tensors) +{ + return make_tensor(ZipIterator(tensors.data()...), + ZipLayout(tensors.layout()...)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +unzip_tensor(Tensor const& tensor) +{ + return cute::transform(tensor.data().iters_, tensor.layout().layouts_, + [](auto iter, auto layout) { return make_tensor(iter, layout); }); +} + +// +// Utilities +// + +template +CUTE_HOST_DEVICE constexpr +auto +rank(ZipLayout const& layouts) +{ + return rank(get<0>(layouts.layouts_)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +size(ZipLayout const& layouts) +{ + return size(get<0>(layouts.layouts_)); +} + +// +// Manipulation +// + +// Extend each component layout to rank-N by appending Layout @a x. +template +CUTE_HOST_DEVICE constexpr +auto +append(ZipLayout const& layouts, + Layout const& x = {}) +{ + return ZipLayout(cute::transform(layouts.layouts_, [&](auto t){ return append(t, x); })); +} + +// Extend each component layout to rank-N by prepending Layout @a x. +template +CUTE_HOST_DEVICE constexpr +auto +prepend(ZipLayout const& layouts, + Layout const& x = {}) +{ + return ZipLayout(cute::transform(layouts.layouts_, [&](auto t){ return prepend(t, x); })); +} + +template +CUTE_HOST_DEVICE constexpr +auto +logical_divide(ZipLayout const& layouts, + Tiler const& tiler) +{ + return ZipLayout(cute::transform(layouts.layouts_, [&](auto t){ return logical_divide(t, tiler); })); +} + +template +CUTE_HOST_DEVICE constexpr +auto +zipped_divide(ZipLayout const& layouts, + Tiler const& tiler) +{ + return ZipLayout(cute::transform(layouts.layouts_, [&](auto t){ return zipped_divide(t, tiler); })); +} + +// Return by calling slice_and_offset and all component layouts. +template +CUTE_HOST_DEVICE constexpr +auto +slice_and_offset(Coord const& c, ZipLayout const& layouts) +{ + auto result = cute::zip(cute::transform(layouts.layouts_, [&c](auto const& layout) { return slice_and_offset(c, layout); })); + return cute::make_tuple(ZipLayout(get<0>(result)), get<1>(result)); +} + +} // end namespace cute diff --git a/include/cute/underscore.hpp b/include/cute/underscore.hpp new file mode 100644 index 0000000000..e9d80fe5b5 --- /dev/null +++ b/include/cute/underscore.hpp @@ -0,0 +1,194 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_INLINE_CONSTANT, CUTE_HOST_DEVICE +#include // cute::is_tuple +#include // cute::false_type, cute::true_type + +namespace cute +{ + +// For slicing +struct Underscore : Int<0> {}; + +CUTE_INLINE_CONSTANT Underscore _; + +// Convenient alias +using X = Underscore; + +// Treat Underscore as an integral like integral_constant +template <> +struct is_integral : true_type {}; + +template +struct is_underscore : false_type {}; +template <> +struct is_underscore : true_type {}; + +// Tuple trait for detecting static member element +template +struct has_elem : false_type {}; +template +struct has_elem : true_type {}; +template +struct has_elem::value> > + : has_elem > {}; +template +struct has_elem> + : disjunction, Elem>...> {}; + +// Tuple trait for detecting static member element +template +struct all_elem : false_type {}; +template +struct all_elem : true_type {}; +template +struct all_elem::value> > + : all_elem > {}; +template +struct all_elem> + : conjunction, Elem>...> {}; + +// Tuple trait for detecting Underscore member +template +using has_underscore = has_elem; + +template +using all_underscore = all_elem; + +template +using has_int1 = has_elem>; + +template +using has_int0 = has_elem>; + +// +// Slice keeps only the elements of Tuple B that are paired with an Underscore +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +lift_slice(A const& a, B const& b) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return filter_tuple(a, b, [](auto const& x, auto const& y) { return lift_slice(x,y); }); + } else if constexpr (is_underscore::value) { + return cute::tuple{b}; + } else { + return cute::tuple<>{}; + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +// Entry point overrides the lifting so that slice(_,b) == b +template +CUTE_HOST_DEVICE constexpr +auto +slice(A const& a, B const& b) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return 
filter_tuple(a, b, [](auto const& x, auto const& y) { return detail::lift_slice(x,y); }); + } else if constexpr (is_underscore::value) { + return b; + } else { + return cute::tuple<>{}; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Dice keeps only the elements of Tuple B that are paired with an Int +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +lift_dice(A const& a, B const& b) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return filter_tuple(a, b, [](auto const& x, auto const& y) { return lift_dice(x,y); }); + } else if constexpr (is_underscore::value) { + return cute::tuple<>{}; + } else { + return cute::tuple{b}; + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +// Entry point overrides the lifting so that dice(1,b) == b +template +CUTE_HOST_DEVICE constexpr +auto +dice(A const& a, B const& b) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return filter_tuple(a, b, [](auto const& x, auto const& y) { return detail::lift_dice(x,y); }); + } else if constexpr (is_underscore::value) { + return cute::tuple<>{}; + } else { + return b; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Display utilities +// + +CUTE_HOST_DEVICE void print(Underscore const&) { + printf("_"); +} + +#if !defined(__CUDACC_RTC__) +CUTE_HOST std::ostream& operator<<(std::ostream& os, Underscore const&) { + return os << "_"; +} +#endif + +} // end namespace cute diff --git a/include/cute/util/debug.hpp b/include/cute/util/debug.hpp new file mode 100644 index 0000000000..2645444369 --- /dev/null +++ b/include/cute/util/debug.hpp @@ -0,0 +1,164 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
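A short sketch of the slice/dice semantics defined above, using hypothetical coordinates: slice keeps the entries of b that are paired with an Underscore in a, while dice keeps the entries paired with anything else.

#include <cute/tensor.hpp>
using namespace cute;

void slice_dice_demo()
{
  auto profile = make_coord(_, 3, _);     // Underscore marks the modes kept by slice
  auto coord   = make_coord(10, 20, 30);

  auto s = slice(profile, coord);         // (10,30)    : entries paired with an Underscore
  auto d = dice (profile, coord);         // (20)       : entries paired with a non-Underscore
  auto a = slice(_, coord);               // (10,20,30) : slice(_, b) == b

  (void)s; (void)d; (void)a;
}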
+ * + **************************************************************************************************/ +#pragma once + +/** + * \file + * \brief Debugging and logging functionality + */ + +#include + +#include + +namespace cute +{ + +/****************************************************************************** + * Debug and logging macros + ******************************************************************************/ + +/** + * Formats and prints the given message to stdout + */ +#if !defined(CUTE_LOG) +# if !defined(__CUDA_ARCH__) +# define CUTE_LOG(format, ...) printf(format, __VA_ARGS__) +# else +# define CUTE_LOG(format, ...) \ + printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ + blockIdx.x, blockIdx.y, blockIdx.z, \ + threadIdx.x, threadIdx.y, threadIdx.z, \ + __VA_ARGS__); +# endif +#endif + +/** + * Formats and prints the given message to stdout only if DEBUG is defined + */ +#if !defined(CUTE_LOG_DEBUG) +# ifdef DEBUG +# define CUTE_LOG_DEBUG(format, ...) CUTE_LOG(format, __VA_ARGS__) +# else +# define CUTE_LOG_DEBUG(format, ...) +# endif +#endif + +/** + * \brief Perror macro with exit + */ +#if !defined(CUTE_ERROR_EXIT) +# define CUTE_ERROR_EXIT(e) \ + do { \ + cudaError_t code = (e); \ + if (code != cudaSuccess) { \ + fprintf(stderr, "<%s:%d> %s:\n %s: %s\n", \ + __FILE__, __LINE__, #e, \ + cudaGetErrorName(code), cudaGetErrorString(code)); \ + fflush(stderr); \ + exit(1); \ + } \ + } while (0) +#endif + +#if !defined(CUTE_CHECK_LAST) +# define CUTE_CHECK_LAST() CUTE_ERROR_EXIT(cudaPeekAtLastError()); CUTE_ERROR_EXIT(cudaDeviceSynchronize()) +#endif + +#if !defined(CUTE_CHECK_ERROR) +# define CUTE_CHECK_ERROR(e) CUTE_ERROR_EXIT(e) +#endif + +// A dummy function that uses compilation failure to print a type +template +CUTE_HOST_DEVICE void +print_type() { + static_assert(sizeof...(T) < 0, "Printing type T."); +} + +template +CUTE_HOST_DEVICE void +print_type(T&&...) { + static_assert(sizeof...(T) < 0, "Printing type T."); +} + +// +// Device-specific helpers +// +// e.g. +// if (thread0()) print(...); +// if (block0()) print(...); +// if (thread(42)) print(...); + +CUTE_HOST_DEVICE +bool +block([[maybe_unused]] int bid) +{ +#if defined(__CUDA_ARCH__) + return blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y == static_cast(bid); +#else + return true; +#endif +} + +CUTE_HOST_DEVICE +bool +thread([[maybe_unused]] int tid, [[maybe_unused]] int bid) +{ +#if defined(__CUDA_ARCH__) + return (threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y == static_cast(tid)) && block(bid); +#else + return true; +#endif +} + +CUTE_HOST_DEVICE +bool +thread(int tid) +{ + return thread(tid,0); +} + +CUTE_HOST_DEVICE +bool +thread0() +{ + return thread(0,0); +} + +CUTE_HOST_DEVICE +bool +block0() +{ + return block(0); +} + +} // end namespace cute diff --git a/include/cute/util/print.hpp b/include/cute/util/print.hpp new file mode 100644 index 0000000000..dbd6581693 --- /dev/null +++ b/include/cute/util/print.hpp @@ -0,0 +1,261 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
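A minimal CUDA sketch (the kernel and launch configuration are hypothetical) of the debug helpers above: thread/block gating for prints, CUTE_LOG, and the host-side error-checking macro. Compile as a .cu file.

#include <cute/util/debug.hpp>
#include <cute/util/print.hpp>

__global__ void debug_demo_kernel()
{
  if (cute::thread0()) {                          // thread 0 of block 0 only
    cute::print("grid=(%u,%u,%u)\n", gridDim.x, gridDim.y, gridDim.z);
  }
  if (cute::thread(42, 1)) {                      // thread 42 of block 1 only
    CUTE_LOG("hello from linear tid %d\n", 42);   // device builds prefix [block (...), thread (...)]
  }
}

int main()
{
  debug_demo_kernel<<<2, 64>>>();
  CUTE_CHECK_LAST();   // cudaPeekAtLastError + cudaDeviceSynchronize, exit(1) on failure
  return 0;
}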
+ * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // CUTE_HOST_DEVICE +#include // cute::is_valid +#include + +// +// CUDA compatible print and printf +// + +namespace cute +{ + +CUTE_HOST_DEVICE +int +num_digits(int x) +{ + return (x < 10 ? 1 : + (x < 100 ? 2 : + (x < 1000 ? 3 : + (x < 10000 ? 4 : + (x < 100000 ? 5 : + (x < 1000000 ? 6 : + (x < 10000000 ? 7 : + (x < 100000000 ? 8 : + (x < 1000000000 ? 9 : + 10))))))))); +} + +// +// print dispatcher +// + +CUTE_HOST_DEVICE +void +print(char c) { + printf("%c", c); +} + +CUTE_HOST_DEVICE +void +print(signed char a) { + printf("%d", static_cast(a)); +} + +CUTE_HOST_DEVICE +void +print(unsigned char a) { + printf("%u", static_cast(a)); +} + +CUTE_HOST_DEVICE +void +print(short a) { + printf("%hd", a); +} + +CUTE_HOST_DEVICE +void +print(unsigned short a) { + printf("%hu", a); +} + +CUTE_HOST_DEVICE +void +print(int a) { + printf("%d", a); +} + +CUTE_HOST_DEVICE +void +print(uint1b_t a) { + printf("%d", int(a)); +} + +CUTE_HOST_DEVICE +void +print(int2b_t a) { + printf("%d", int(a)); +} + +CUTE_HOST_DEVICE +void +print(uint2b_t a) { + printf("%d", int(a)); +} + +CUTE_HOST_DEVICE +void +print(int4b_t a) { + printf("%d", int(a)); +} + +CUTE_HOST_DEVICE +void +print(uint4b_t a) { + printf("%d", int(a)); +} + +CUTE_HOST_DEVICE +void +print(bin1_t a) { + printf("%d", int(a)); +} + +CUTE_HOST_DEVICE +void +print(unsigned int a) { + printf("%u", a); +} + +CUTE_HOST_DEVICE +void +print(long a) { + printf("%ld", a); +} + +CUTE_HOST_DEVICE +void +print(unsigned long a) { + printf("%lu", a); +} + +CUTE_HOST_DEVICE +void +print(long long a) { + printf("%lld", a); +} + +CUTE_HOST_DEVICE +void +print(unsigned long long a) { + printf("%llu", a); +} + +CUTE_HOST_DEVICE +void +print(float a) { + printf("%f", a); +} + +CUTE_HOST_DEVICE +void +print(double a) { + printf("%f", a); +} + +template +CUTE_HOST_DEVICE +void +print(char const* format, T const&... 
t) { + printf(format, t...); +} + +CUTE_HOST_DEVICE +void +print(char const* format) { + printf("%s", format); +} + +// +// pretty printing +// + +CUTE_HOST_DEVICE void +pretty_print(uint1b_t a) { + printf("%*d", 3, int(a)); +} + +CUTE_HOST_DEVICE void +pretty_print(int2b_t a) { + printf("%*d", 5, int(a)); +} + +CUTE_HOST_DEVICE void +pretty_print(uint2b_t a) { + printf("%*d", 5, int(a)); +} + +CUTE_HOST_DEVICE void +pretty_print(int4b_t a) { + printf("%*d", 5, int(a)); +} + +CUTE_HOST_DEVICE void +pretty_print(uint4b_t a) { + printf("%*d", 5, int(a)); +} + +CUTE_HOST_DEVICE void +pretty_print(bool v) { + printf("%*d", 3, int(v)); +} + +CUTE_HOST_DEVICE void +pretty_print(int32_t v) { + printf("%*d", 5, v); +} + +CUTE_HOST_DEVICE void +pretty_print(uint32_t v) { + printf("%*d", 5, v); +} + +CUTE_HOST_DEVICE void +pretty_print(int64_t v) { + printf("%*lld", 5, static_cast(v)); +} + +CUTE_HOST_DEVICE void +pretty_print(uint64_t v) { + printf("%*llu", 5, static_cast(v)); +} + +CUTE_HOST_DEVICE void +pretty_print(float v) { + printf("%*.2e", 10, v); +} + +CUTE_HOST_DEVICE void +pretty_print(double v) { + printf("%*.3e", 11, v); +} + +template +CUTE_HOST_DEVICE void +pretty_print(T t) { + printf(" "); print(t); +} + +} // end namespace cute diff --git a/include/cute/util/type_traits.hpp b/include/cute/util/type_traits.hpp new file mode 100644 index 0000000000..a3074ef947 --- /dev/null +++ b/include/cute/util/type_traits.hpp @@ -0,0 +1,298 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
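For reference, a small hedged sketch of the difference between the two dispatchers above: print uses the natural format of the value, while pretty_print pads to a fixed field width so that print_tensor grids line up.

#include <cute/util/print.hpp>

void print_demo()
{
  cute::print(-3);               // "-3"
  cute::print('\n');
  cute::pretty_print(-3);        // "   -3"        (width-5 field for 32-bit integers)
  cute::print('\n');
  cute::pretty_print(3.14159f);  // "  3.14e+00"   (width-10 scientific for float)
  cute::print('\n');
}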
+ * + **************************************************************************************************/ +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#include +#include +#include +#include +#else +#include +#include // tuple_size, tuple_element +#include // ptrdiff_t +#include // uintptr_t +#include // numeric_limits +#endif + +#include // CUTE_STL_NAMESPACE + +namespace cute +{ + using CUTE_STL_NAMESPACE::enable_if; + using CUTE_STL_NAMESPACE::enable_if_t; +} + +#define __CUTE_REQUIRES(...) typename cute::enable_if<(__VA_ARGS__)>::type* = nullptr +#define __CUTE_REQUIRES_V(...) typename cute::enable_if::type* = nullptr + +namespace cute +{ + +// +using CUTE_STL_NAMESPACE::conjunction; +using CUTE_STL_NAMESPACE::conjunction_v; + +using CUTE_STL_NAMESPACE::disjunction; +using CUTE_STL_NAMESPACE::disjunction_v; + +using CUTE_STL_NAMESPACE::negation; +using CUTE_STL_NAMESPACE::negation_v; + +using CUTE_STL_NAMESPACE::void_t; +using CUTE_STL_NAMESPACE::is_void_v; + +using CUTE_STL_NAMESPACE::is_base_of; +using CUTE_STL_NAMESPACE::is_base_of_v; + +using CUTE_STL_NAMESPACE::is_const; +using CUTE_STL_NAMESPACE::is_const_v; +using CUTE_STL_NAMESPACE::is_volatile; +using CUTE_STL_NAMESPACE::is_volatile_v; + +// Defined in cute/numeric/integral_constant.hpp +// using CUTE_STL_NAMESPACE::true_type; +// using CUTE_STL_NAMESPACE::false_type; + +using CUTE_STL_NAMESPACE::conditional; +using CUTE_STL_NAMESPACE::conditional_t; + +using CUTE_STL_NAMESPACE::add_const_t; + +using CUTE_STL_NAMESPACE::remove_const_t; +using CUTE_STL_NAMESPACE::remove_cv_t; +using CUTE_STL_NAMESPACE::remove_reference_t; + +using CUTE_STL_NAMESPACE::extent; +using CUTE_STL_NAMESPACE::remove_extent; + +using CUTE_STL_NAMESPACE::decay; +using CUTE_STL_NAMESPACE::decay_t; + +using CUTE_STL_NAMESPACE::is_lvalue_reference; +using CUTE_STL_NAMESPACE::is_lvalue_reference_v; + +using CUTE_STL_NAMESPACE::is_reference; +using CUTE_STL_NAMESPACE::is_trivially_copyable; + +using CUTE_STL_NAMESPACE::is_convertible; +using CUTE_STL_NAMESPACE::is_convertible_v; + +using CUTE_STL_NAMESPACE::is_same; +using CUTE_STL_NAMESPACE::is_same_v; + +using CUTE_STL_NAMESPACE::is_constructible; +using CUTE_STL_NAMESPACE::is_constructible_v; +using CUTE_STL_NAMESPACE::is_default_constructible; +using CUTE_STL_NAMESPACE::is_default_constructible_v; +using CUTE_STL_NAMESPACE::is_standard_layout; +using CUTE_STL_NAMESPACE::is_standard_layout_v; + +using CUTE_STL_NAMESPACE::is_arithmetic; +using CUTE_STL_NAMESPACE::is_unsigned; +using CUTE_STL_NAMESPACE::is_unsigned_v; +using CUTE_STL_NAMESPACE::is_signed; +using CUTE_STL_NAMESPACE::is_signed_v; + +using CUTE_STL_NAMESPACE::make_signed; +using CUTE_STL_NAMESPACE::make_signed_t; + +// using CUTE_STL_NAMESPACE::is_integral; +template +using is_std_integral = CUTE_STL_NAMESPACE::is_integral; + +using CUTE_STL_NAMESPACE::is_empty; +using CUTE_STL_NAMESPACE::is_empty_v; + +using CUTE_STL_NAMESPACE::invoke_result_t; + +using CUTE_STL_NAMESPACE::common_type; +using CUTE_STL_NAMESPACE::common_type_t; + +using CUTE_STL_NAMESPACE::remove_pointer; +using CUTE_STL_NAMESPACE::remove_pointer_t; + +using CUTE_STL_NAMESPACE::add_pointer; +using CUTE_STL_NAMESPACE::add_pointer_t; + +using CUTE_STL_NAMESPACE::alignment_of; +using CUTE_STL_NAMESPACE::alignment_of_v; + +using CUTE_STL_NAMESPACE::is_pointer; +using CUTE_STL_NAMESPACE::is_pointer_v; + +// +using CUTE_STL_NAMESPACE::declval; + +template +constexpr T&& forward(remove_reference_t& t) noexcept +{ + return static_cast(t); +} + +template +constexpr T&& 
forward(remove_reference_t&& t) noexcept +{ + static_assert(! is_lvalue_reference_v, "T cannot be an lvalue reference (e.g., U&)."); + return static_cast(t); +} + +template +constexpr remove_reference_t&& move(T&& t) noexcept +{ + return static_cast&&>(t); +} + +// +using CUTE_STL_NAMESPACE::numeric_limits; + +// +using CUTE_STL_NAMESPACE::ptrdiff_t; + +// +using CUTE_STL_NAMESPACE::uintptr_t; + +// C++20 +// using std::remove_cvref; +template +struct remove_cvref { + using type = remove_cv_t>; +}; + +// C++20 +// using std::remove_cvref_t; +template +using remove_cvref_t = typename remove_cvref::type; + +// +// dependent_false +// +// @brief An always-false value that depends on one or more template parameters. +// See +// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1830r1.pdf +// https://github.com/cplusplus/papers/issues/572 +// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html +template +inline constexpr bool dependent_false = false; + +// +// tuple_size, tuple_element +// +// @brief CuTe-local tuple-traits to prevent conflicts with other libraries. +// For cute:: types, we specialize std::tuple-traits, which is explicitly allowed. +// cute::tuple, cute::array, cute::array_subbyte, etc +// But CuTe wants to treat some external types as tuples as well. For those, +// we specialize cute::tuple-traits to avoid polluting external traits. +// dim3, uint3, etc + +template +struct tuple_size; + +template +struct tuple_size::type>> : CUTE_STL_NAMESPACE::integral_constant::value> {}; + +// S = : std::integral_constant::value> {}; + +template +constexpr size_t tuple_size_v = tuple_size::value; + +template +struct tuple_element; + +template +struct tuple_element::type>> : CUTE_STL_NAMESPACE::tuple_element {}; + +template +using tuple_element_t = typename tuple_element::type; + +// +// is_valid +// + +namespace detail { + +template ()(declval()...))> +CUTE_HOST_DEVICE constexpr auto +is_valid_impl(int) { return CUTE_STL_NAMESPACE::true_type{}; } + +template +CUTE_HOST_DEVICE constexpr auto +is_valid_impl(...) { return CUTE_STL_NAMESPACE::false_type{}; } + +template +struct is_valid_fn { + template + CUTE_HOST_DEVICE constexpr auto + operator()(Args&&...) const { return is_valid_impl(int{}); } +}; + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr auto +is_valid(F&&) { + return detail::is_valid_fn{}; +} + +template +CUTE_HOST_DEVICE constexpr auto +is_valid(F&&, Args&&...) { + return detail::is_valid_impl(int{}); +} + +template class True, template class False> +struct conditional_template { + template + using type = True; +}; + +template class True, template class False> +struct conditional_template { + template + using type = False; +}; + +// +// is_any_of +// + +// Member `value` is true if and only if T is same as (is_same_v) at least one of the types in Us +template +struct is_any_of { + constexpr static bool value = (... || CUTE_STL_NAMESPACE::is_same_v); +}; + +// Is true if and only if T is same as (is_same_v) at least one of the types in Us +template +inline constexpr bool is_any_of_v = is_any_of::value; + +} // end namespace cute diff --git a/include/cutlass/aligned_buffer.h b/include/cutlass/aligned_buffer.h index f869d388b0..0d2bb29048 100644 --- a/include/cutlass/aligned_buffer.h +++ b/include/cutlass/aligned_buffer.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
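As a hedged illustration of the is_valid detection helper defined above (the Foo/Bar types and has_size detector are made up): wrapping a generic lambda yields a callable whose result type is true_type or false_type depending on whether the expression in the lambda's trailing return type is well-formed for the given argument.

#include <cute/util/type_traits.hpp>

struct Foo { int size() const { return 4; } };
struct Bar {};

// Detector for "t.size() is a valid expression".
inline auto has_size = cute::is_valid([](auto&& t) -> decltype(t.size()) { return t.size(); });

static_assert( decltype(has_size(Foo{}))::value, "Foo provides .size()");
static_assert(!decltype(has_size(Bar{}))::value, "Bar does not provide .size()");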
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h index 578e6c14a3..36d4676bdf 100644 --- a/include/cutlass/arch/arch.h +++ b/include/cutlass/arch/arch.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -34,12 +34,14 @@ #pragma once +#include "cutlass/cutlass.h" + //////////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { namespace arch { -#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) +#if defined(__NVCC__) || defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) /// Computes laneId within a warp CUTLASS_DEVICE @@ -84,6 +86,12 @@ struct Sm80 { struct Sm86 { static int const kMinComputeCapability = 86; }; +struct Sm89 { + static int const kMinComputeCapability = 89; +}; +struct Sm90 { + static int const kMinComputeCapability = 90; +}; /// Triggers a breakpoint on the device CUTLASS_DEVICE diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h new file mode 100644 index 0000000000..460531aa89 --- /dev/null +++ b/include/cutlass/arch/barrier.h @@ -0,0 +1,723 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Barrier Operations on SM90+ +*/ + +#pragma once + +#include +#include +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && (__CUDACC_VER_MAJOR__ >= 12) +#define CUDA_BARRIER_ENABLED 1 +#else +#define CUDA_BARRIER_ENABLED 0 +#endif + +namespace cutlass { +/// @brief +namespace arch { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +CUTLASS_DEVICE void fence_view_async_shared(); + +namespace detail { // namespace detail begin + +// Single threaded versions that need to be called in an elect_one region +template +CUTLASS_DEVICE +void initialize_barrier_array(T ptr, int arv_cnt) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Stages; i++) { + ptr[i].init(arv_cnt); + } +} + +template +CUTLASS_DEVICE +void initialize_barrier_array(uint64_t *ptr, int arv_cnt) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Stages; i++) { + T::init(&ptr[i], arv_cnt); + } +} + +template +CUTLASS_DEVICE +void initialize_barrier_array_pair(FullBarrier full_barriers, EmptyBarrier empty_barriers, int full_barrier_arv_cnt, int empty_barrier_arv_cnt) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Stages; i++) { + full_barriers[i].init(full_barrier_arv_cnt); + empty_barriers[i].init(empty_barrier_arv_cnt); + } +} + +template +CUTLASS_DEVICE +void initialize_barrier_array_pair(uint64_t *full_barriers_ptr, uint64_t *empty_barriers_ptr, int full_barrier_arv_cnt, int empty_barrier_arv_cnt) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Stages; i++) { + FullBarrier::init(&full_barriers_ptr[i], full_barrier_arv_cnt); + EmptyBarrier::init(&empty_barriers_ptr[i], empty_barrier_arv_cnt); + } +} + +// Aligned versions that need to be call warp wide +template +CUTLASS_DEVICE +void initialize_barrier_array_aligned(T ptr, int arv_cnt) { + if(cute::elect_one_sync()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Stages; i++) { + ptr[i].init(arv_cnt); + } + } +} + +template +CUTLASS_DEVICE +void initialize_barrier_array_aligned(uint64_t *ptr, int arv_cnt) { + if(cute::elect_one_sync()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Stages; i++) { + T::init(&ptr[i], arv_cnt); + } + } +} + +template +CUTLASS_DEVICE +void initialize_barrier_array_pair_aligned(FullBarrier full_barriers, EmptyBarrier empty_barriers, int full_barrier_arv_cnt, int empty_barrier_arv_cnt) { + if(cute::elect_one_sync()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Stages; i++) { + full_barriers[i].init(full_barrier_arv_cnt); + empty_barriers[i].init(empty_barrier_arv_cnt); + } + } +} + +template +CUTLASS_DEVICE +void initialize_barrier_array_pair_aligned(uint64_t *full_barriers_ptr, uint64_t *empty_barriers_ptr, int full_barrier_arv_cnt, int empty_barrier_arv_cnt) { + if(cute::elect_one_sync()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Stages; i++) { + FullBarrier::init(&full_barriers_ptr[i], full_barrier_arv_cnt); + EmptyBarrier::init(&empty_barriers_ptr[i], empty_barrier_arv_cnt); + } + } +} + +} // namespace detail end + + +// Enumerates the reserved named barriers to avoid potential conflicts +// This enum class specifies the NamedBarriers reserved by CUTLASS. 
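A hedged device-side sketch of what the single-threaded initialization helpers above expand to; the stage count, arrive counts, and kernel are illustrative. One elected thread initializes an array of shared-memory mbarriers, then the CTA synchronizes before anyone uses them (for cluster-scope use, the barrier-init fence described at the end of this file should also be issued).

#include <cutlass/arch/barrier.h>

constexpr int Stages = 4;

__global__ void init_barriers_demo()
{
  __shared__ uint64_t full_barrier[Stages];
  __shared__ uint64_t empty_barrier[Stages];

  if (threadIdx.x == 0) {
    // Hand-rolled equivalent of the detail::initialize_barrier_array_pair helpers above.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Stages; ++i) {
      cutlass::arch::ClusterBarrier::init(&full_barrier[i],  /*arrive_count=*/ 1);
      cutlass::arch::ClusterBarrier::init(&empty_barrier[i], /*arrive_count=*/ blockDim.x);
    }
  }
  __syncthreads();   // make the initialized barriers visible to the whole CTA before first use
}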
+enum class ReservedNamedBarriers { + EpilogueBarrier = 1, + TransposeBarrier = 2, + TransformBarrier = 3, + StreamkBarrier0 = 4, + StreamkBarrier1 = 5 + , FirstUserBarrier = StreamkBarrier1 + 1 +}; + + +class NamedBarrier { + + // Data Members: + + // Range = [1 , NUM_THREADS_PER_CTA] + // Range % warp-size (i.e 32) == 0 + uint32_t const num_threads_; + + // Range : [0, 15] + // Note that should be set to the final barrier ID, including ReserveNamedBarrierCount should be considered + uint32_t const id_; + + public: + + // Constructor for CUTLASS developers: + // effective barrier ID starts from 0 + CUTLASS_DEVICE + NamedBarrier(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers) + : num_threads_(num_threads), id_(static_cast(reserved_named_barriers)) {} + + // Constructor for CUTLASS users: + // effective barrier ID starts from ReservedNamedBarrierCount + CUTLASS_DEVICE + NamedBarrier(uint32_t num_threads, uint32_t id = 0) + : num_threads_(num_threads), id_(id + ReservedNamedBarrierCount) { + CUTLASS_ASSERT(id + ReservedNamedBarrierCount <= HardwareMaxNumNamedBarriers && "Effective barrier_id should not exceed 16."); + } + + CUTLASS_DEVICE + void arrive_and_wait() const { + // Note: The value of id_ is already the final barrier id (set correctly in the constructor). + NamedBarrier::arrive_and_wait_internal(num_threads_, id_); + } + + CUTLASS_DEVICE + void arrive_and_wait_unaligned() const { + // Note: The value of id_ is already the final barrier id (set correctly in the constructor). + NamedBarrier::arrive_and_wait_internal_unaligned(num_threads_, id_); + } + + CUTLASS_DEVICE + void arrive() const { + // Note: The value of id_ is already the final barrier id (set correctly in the constructor). + NamedBarrier::arrive_internal(num_threads_, id_); + } + + CUTLASS_DEVICE + void arrive_unaligned() const { + // Note: The value of id_ is already the final barrier id (set correctly in the constructor). 
+ NamedBarrier::arrive_internal_unaligned(num_threads_, id_); + } + + CUTLASS_DEVICE + void sync() const { + NamedBarrier::arrive_and_wait(); + } + + // Static variants + + // Calling interface for CUTLASS users: + // effective barrier ID starts from ReservedNamedBarrierCount + CUTLASS_DEVICE + static void arrive_and_wait(uint32_t num_threads, uint32_t barrier_id) { + arrive_and_wait_internal(num_threads, barrier_id + ReservedNamedBarrierCount); + } + + // Calling interface for CUTLASS developers: + // effective barrier ID starts from 0 + CUTLASS_DEVICE + static void arrive_and_wait(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers) { + arrive_and_wait_internal(num_threads, static_cast(reserved_named_barriers)); + } + + // Calling interface for CUTLASS users: + // effective barrier ID starts from ReservedNamedBarrierCount + CUTLASS_DEVICE + static void arrive(uint32_t num_threads, uint32_t barrier_id) { + arrive_internal(num_threads, barrier_id + ReservedNamedBarrierCount); + } + + // Calling interface for CUTLASS developers: + // effective barrier ID starts from 0 + CUTLASS_DEVICE + static void arrive(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers) { + arrive_internal(num_threads, static_cast(reserved_named_barriers)); + } + + // Calling interface for CUTLASS users: + // effective barrier ID starts from ReservedNamedBarrierCount + CUTLASS_DEVICE + static void sync(uint32_t num_threads, uint32_t barrier_id) { + sync_internal(num_threads, barrier_id + ReservedNamedBarrierCount); + } + + // Calling interface for CUTLASS developers: + // effective barrier ID starts from 0 + CUTLASS_DEVICE + static void sync(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers) { + sync_internal(num_threads, static_cast(reserved_named_barriers)); + } + + + private: + CUTLASS_DEVICE + static void arrive_and_wait_internal(uint32_t num_threads, uint32_t barrier_id) { +#if CUDA_BARRIER_ENABLED + asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads)); + cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(__LINE__, num_threads, barrier_id); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + CUTLASS_DEVICE + static void arrive_and_wait_internal_unaligned(uint32_t num_threads, uint32_t barrier_id) { +#if CUDA_BARRIER_ENABLED + asm volatile("barrier.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads)); + cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(__LINE__, num_threads, barrier_id); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + CUTLASS_DEVICE + static void arrive_internal(uint32_t num_threads, uint32_t barrier_id) { +#if CUDA_BARRIER_ENABLED + cutlass::arch::synclog_emit_named_barrier_arrive(__LINE__, num_threads, barrier_id); + asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads)); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + CUTLASS_DEVICE + static void arrive_internal_unaligned(uint32_t num_threads, uint32_t barrier_id) { +#if CUDA_BARRIER_ENABLED + cutlass::arch::synclog_emit_named_barrier_arrive(__LINE__, num_threads, barrier_id); + asm volatile("barrier.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads)); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + CUTLASS_DEVICE + static void sync_internal(uint32_t num_threads, uint32_t barrier_id) { + NamedBarrier::arrive_and_wait_internal(num_threads, barrier_id); + } + + public: + // Currently we reserve 8 NamedBarriers for CUTLASS' own use 
cases, + // while leaving the renaming for general users. + static const uint32_t ReservedNamedBarrierCount = static_cast(ReservedNamedBarriers::FirstUserBarrier); + static const uint32_t HardwareMaxNumNamedBarriers = 16; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Hopper introduces a new cluster-wide barrier which handle with Cluster-wide arrive-wait behaviour. +// This is an extension to the Ampere arrive-wait barriers +// Note : Ampere arrive-wait Barriers have a larger max-arrive count (2^30) than Hopper arrive-wait Barriers (2^20). +struct ClusterBarrier { + + using ValueType = uint64_t; + +protected: + // Can never be initialized - can only be aliased to smem + ValueType barrier_; + +public: + + CUTLASS_DEVICE + ClusterBarrier() = delete; + + CUTLASS_DEVICE + void init(uint32_t arrive_count) const { + ClusterBarrier::init(&this->barrier_, arrive_count); + } + + CUTLASS_DEVICE + bool test_wait(uint32_t phase, uint32_t pred=true) const { + return ClusterBarrier::test_wait(&this->barrier_, phase, pred); + } + + CUTLASS_DEVICE + bool try_wait(uint32_t phase) const { + return ClusterBarrier::try_wait(&this->barrier_, phase); + } + + CUTLASS_DEVICE + void wait(uint32_t phase) const { + ClusterBarrier::wait(&this->barrier_, phase); + } + + // Barrier arrive on local smem + CUTLASS_DEVICE + void arrive() const { + ClusterBarrier::arrive(&this->barrier_); + } + + // Remote SMEM arrive with a perdicate (usually done to pick the thread doing the arrive) + CUTLASS_DEVICE + void arrive(uint32_t cta_id, uint32_t pred = true ) const { + ClusterBarrier::arrive(&this->barrier_, cta_id, pred); + } + + // + // Static Versions + // + CUTLASS_DEVICE + static void init(ValueType const* smem_ptr, uint32_t arrive_count) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + asm volatile( + "{\n\t" + "mbarrier.init.shared::cta.b64 [%1], %0; \n" + "}" + : + : "r"(arrive_count), "r"(smem_addr)); + cutlass::arch::synclog_emit_cluster_barrier_init(__LINE__, smem_addr, arrive_count); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + // Static version of wait - in case we don't want to burn a register + CUTLASS_DEVICE + static void wait(ValueType const* smem_ptr, uint32_t phase) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_cluster_barrier_wait(__LINE__, smem_addr, phase); + // Arbitrarily large timer value after which try-wait expires and re-tries. 
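As a hedged usage sketch of the ClusterBarrier object API above (CTA scope only, Hopper/SM90 with CUDA 12+; the arrive count and thread roles are illustrative): one producer thread arrives on a shared-memory barrier and every thread waits on the parity of its first completion.

#include <cutlass/arch/barrier.h>

__global__ void cluster_barrier_demo()
{
  __shared__ uint64_t bar_storage;
  auto& bar = *reinterpret_cast<cutlass::arch::ClusterBarrier*>(&bar_storage);

  if (threadIdx.x == 0) {
    bar.init(/*arrive_count=*/ 1);
  }
  __syncthreads();                 // barrier is initialized before anyone arrives or waits

  if (threadIdx.x == 0) {
    // ... produce something in shared memory ...
    bar.arrive();                  // completes phase 0, since arrive_count == 1
  }

  uint32_t phase = 0;
  bar.wait(phase);                 // all threads block until phase 0 has completed
  phase ^= 1;                      // subsequent waits on this barrier use the flipped parity
}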
+ uint32_t ticks = 0x989680; + asm volatile( + "{\n\t" + ".reg .pred P1; \n\t" + "LAB_WAIT: \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1, %2; \n\t" + "@P1 bra DONE; \n\t" + "bra LAB_WAIT; \n\t" + "DONE: \n\t" + "}" + : + : "r"(smem_addr), "r"(phase), "r"(ticks)); + +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + CUTLASS_DEVICE + static bool test_wait(ValueType const* smem_ptr, uint32_t phase, uint32_t pred) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_cluster_barrier_test_wait(__LINE__, smem_addr, phase, pred); + uint32_t waitComplete; + + asm volatile( + "{\n\t" + ".reg .pred P1; \n\t" + ".reg .pred P2; \n\t" + "setp.eq.u32 P2, %3, 1;\n\t" + "@P2 mbarrier.test_wait.parity.shared::cta.b64 P1, [%1], %2; \n\t" + "selp.b32 %0, 1, 0, P1; \n\t" + "}" + : "=r"(waitComplete) + : "r"(smem_addr), "r"(phase), "r"(pred)); + + return static_cast(waitComplete); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + return 0; + } + + CUTLASS_DEVICE + static bool try_wait(ValueType const* smem_ptr, uint32_t phase) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + cutlass::arch::synclog_emit_cluster_barrier_try_wait(__LINE__, smem_addr, phase); + uint32_t waitComplete; + + asm volatile( + "{\n\t" + ".reg .pred P1; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P1, [%1], %2; \n\t" + "selp.b32 %0, 1, 0, P1; \n\t" + "}" + : "=r"(waitComplete) + : "r"(smem_addr), "r"(phase)); + + return static_cast(waitComplete); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + return 0; + } + + // Static Predicated version of the above - in case we know the address. + CUTLASS_DEVICE + static void arrive(ValueType const* smem_ptr, uint32_t cta_id, uint32_t pred) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + if (pred) { + asm volatile( + "{\n\t" + ".reg .b32 remAddr32;\n\t" + "mapa.shared::cluster.u32 remAddr32, %0, %1;\n\t" + "mbarrier.arrive.shared::cluster.b64 _, [remAddr32];\n\t" + "}" + : + : "r"(smem_addr), "r"(cta_id)); + } + + cutlass::arch::synclog_emit_cluster_barrier_arrive_cluster(__LINE__, smem_addr, cta_id, pred); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + // Barrier arrive on local smem + CUTLASS_DEVICE + static void arrive(ValueType const* smem_ptr) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + asm volatile( + "{\n\t" + "mbarrier.arrive.shared::cta.b64 _, [%0];\n\t" + "}" + : + : "r"(smem_addr)); + cutlass::arch::synclog_emit_cluster_barrier_arrive(__LINE__, smem_addr); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + CUTLASS_DEVICE + static void invalidate(ValueType const* smem_ptr) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + asm volatile( + "{\n\t" + "mbarrier.inval.shared::cta.b64 [%0]; \n\t" + "}" + : + : "r"(smem_addr)); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// SM90 also introduces a new type of cluster-barrier which supports sync. 
+// not just based on Arrive Count, but also transaction count (in bytes) +struct ClusterTransactionBarrier : public ClusterBarrier { + + CUTLASS_DEVICE + ClusterTransactionBarrier() = delete; + + // Performs an arrive operation + expected transaction bytes increment + CUTLASS_DEVICE + void arrive_and_expect_tx(uint32_t transaction_bytes) const { + ClusterTransactionBarrier::arrive_and_expect_tx(&this->barrier_, transaction_bytes); + } + + // Performs an arrive operation + expected transaction bytes increment + CUTLASS_DEVICE + void arrive_and_expect_tx(uint32_t transaction_bytes, uint32_t cta_id, uint32_t pred = 1u) const { + ClusterTransactionBarrier::arrive_and_expect_tx(&this->barrier_, transaction_bytes , cta_id, pred); + } + + // Performs an expected transaction bytes increment without doing an arrive operation + CUTLASS_DEVICE + void expect_transaction(uint32_t transaction_bytes) const { + ClusterTransactionBarrier::expect_transaction(&this->barrier_, transaction_bytes); + } + + // Performs an expected transaction bytes decrement without doing an arrive operation + CUTLASS_DEVICE + void complete_transaction(uint32_t transaction_bytes, uint32_t pred = 1) const { + uint32_t cta_rank = cute::block_rank_in_cluster(); + ClusterTransactionBarrier::complete_transaction(&this->barrier_, cta_rank, transaction_bytes, pred); + } + + // Performs an expected transaction bytes decrement without doing an arrive operation + CUTLASS_DEVICE + void complete_transaction(uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred) const { + ClusterTransactionBarrier::complete_transaction(&this->barrier_, dst_cta_id, transaction_bytes, pred); + } + + // + // Static Versions + // + + // Performs an arrive operation + expected transaction bytes increment + CUTLASS_DEVICE + static void arrive_and_expect_tx(ValueType const* smem_ptr, uint32_t transaction_bytes) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + asm volatile( + "{\n\t" + "mbarrier.arrive.expect_tx.shared::cta.b64 _, [%1], %0; \n\t" + "}" + : + : "r"(transaction_bytes), "r"(smem_addr)); + cutlass::arch::synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx(__LINE__, smem_addr, transaction_bytes); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + // Performs an arrive operation + expected transaction bytes increment for a remote cta_id in a Cluster + CUTLASS_DEVICE + static void arrive_and_expect_tx( + ValueType const* smem_ptr, uint32_t transaction_bytes, uint32_t cta_id, uint32_t pred) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + asm volatile( + "{\n\t" + ".reg .pred p;\n\t" + ".reg .b32 remAddr32;\n\t" + "setp.eq.u32 p, %2, 1;\n\t" + "@p mapa.shared::cluster.u32 remAddr32, %0, %1;\n\t" + "@p mbarrier.arrive.expect_tx.shared::cluster.b64 _, [remAddr32], %3;\n\t" + "}" + : + : "r"(smem_addr), "r"(cta_id), "r"(pred), "r"(transaction_bytes)); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + // Performs an expected transaction bytes increment without doing an arrive operation + CUTLASS_DEVICE + static void expect_transaction(ValueType const* smem_ptr, uint32_t transaction_bytes) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + asm volatile( + "{\n\t" + "mbarrier.expect_tx.shared::cta.b64 [%1], %0; \n\t" + "}" + : + : "r"(transaction_bytes), "r"(smem_addr)); + cutlass::arch::synclog_emit_cluster_transaction_barrier_expect_transaction(__LINE__, 
smem_addr, transaction_bytes); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + // Performs an expected transaction bytes decrement without doing an arrive operation + CUTLASS_DEVICE + static void complete_transaction( + ValueType const* smem_ptr, uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred = 1) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + smem_addr = cute::set_block_rank(smem_addr, dst_cta_id); + asm volatile( + "{\n\t" + ".reg .pred p;\n\t" + "setp.eq.u32 p, %2, 1;\n\t" + "@p mbarrier.complete_tx.shared::cluster.relaxed.cluster.b64 [%1], %0;" + "}" + : + : "r"(transaction_bytes), "r"(smem_addr), "r"(pred)); + cutlass::arch::synclog_emit_cluster_transaction_barrier_complete_transaction(__LINE__, smem_addr, dst_cta_id, transaction_bytes, pred); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif + } + + // + // DEPRECATED APIs + // + [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE + void arrive_and_reset_bytes(uint32_t transaction_bytes) const { + arrive_and_expect_tx(transaction_bytes); + } + [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE + void arrive_and_reset_bytes(uint32_t transaction_bytes, uint32_t cta_id) const { + arrive_and_expect_tx(transaction_bytes, cta_id); + } + [[deprecated("Use expect_transaction instead")]] CUTLASS_DEVICE + void reset_bytes(uint32_t transaction_bytes) const { + expect_transaction(transaction_bytes); + } + [[deprecated("Use complete_transaction instead")]] CUTLASS_DEVICE + void commit(uint32_t transaction_bytes, uint32_t pred = 1) const { + complete_transaction(transaction_bytes, pred); + } + [[deprecated("Use complete_transaction instead")]] CUTLASS_DEVICE + void commit(uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred) const { + complete_transaction(dst_cta_id, transaction_bytes, pred); + } + [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE + static void arrive_and_reset_bytes(ValueType const* smem_ptr, uint32_t transaction_bytes) { + arrive_and_expect_tx(smem_ptr, transaction_bytes); + } + [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE + static void arrive_and_reset_bytes(ValueType const* smem_ptr, uint32_t transaction_bytes, uint32_t cta_id, uint32_t pred) { + arrive_and_expect_tx(smem_ptr, transaction_bytes, cta_id, pred); + } + [[deprecated("Use expect_transaction instead")]] CUTLASS_DEVICE + static void reset_bytes(ValueType const* smem_ptr, uint32_t transaction_bytes) { + expect_transaction(smem_ptr, transaction_bytes); + } + [[deprecated("Use complete_transaction instead")]] CUTLASS_DEVICE + static void commit(ValueType const* smem_ptr, uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred = 1) { + complete_transaction(smem_ptr, dst_cta_id, transaction_bytes, pred); + } +}; + +// Helps with visibility of barrier init operations across warps / cta / cluster +// Available as a separate function so as to batch inits across barriers and fence once +// Note : It must be composed with an appropriate sync instruction with the right scope +// to ensure visibility eg. 
__syncthreads() or a cluster_arrive() + cluster_wait() +CUTLASS_DEVICE +void fence_barrier_init() { +#if CUDA_BARRIER_ENABLED + cutlass::arch::synclog_emit_fence_barrier_init(__LINE__); + asm volatile( + "{\n\t" + "fence.mbarrier_init.release.cluster; \n" + "}" + ::); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif +} + +// Issue a shared memory fence for async operations +CUTLASS_DEVICE +void fence_view_async_shared() { +#if CUDA_BARRIER_ENABLED + cutlass::arch::synclog_emit_fence_view_async_shared(__LINE__); + asm volatile ( + "{\n\t" + "fence.proxy.async.shared::cta; \n" + "}" + ::); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif +} + +// Arrive on completion of in-flight cp.async operations issued by the calling thread +CUTLASS_DEVICE +void cpasync_barrier_arrive(uint64_t const* smem_ptr) { +#if CUDA_BARRIER_ENABLED + uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr); + asm volatile( + "{\n\t" + "cp.async.mbarrier.arrive.shared::cta.b64 [%0];\n\t" + "}" + : + : "r"(smem_addr)); + cutlass::arch::synclog_emit_cpasync_barrier_arrive(__LINE__, smem_addr); +#elif defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// +} // end namespace arch +} // end namespace cutlass diff --git a/include/cutlass/arch/cache_operation.h b/include/cutlass/arch/cache_operation.h index d84d4790d0..9d2344bf32 100644 --- a/include/cutlass/arch/cache_operation.h +++ b/include/cutlass/arch/cache_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/arch/config.h b/include/cutlass/arch/config.h new file mode 100644 index 0000000000..0fc60f41db --- /dev/null +++ b/include/cutlass/arch/config.h @@ -0,0 +1,85 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Definitions for architecture macros +*/ + +#pragma once + +#include "cutlass/platform/platform.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// SM90 +#if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 0)) + #define CUTLASS_ARCH_MMA_SM90_SUPPORTED 1 + #if (!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900) + #define CUTLASS_ARCH_MMA_SM90_ENABLED 1 + + #if (!defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && defined(__CUDA_ARCH_FEAT_SM90_ALL)) + #define CUTLASS_ARCH_MMA_SM90A_ENABLED 1 + #endif + #endif +#endif + +#if (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2) + #define CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// SM90 Modifiable +#if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 3)) + #define CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED 1 + #if (!defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900) + #define CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_ENABLED 1 + + #if (!defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90A_ENABLED) && defined(__CUDA_ARCH_FEAT_SM90_ALL)) + #define CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90A_ENABLED 1 + #endif + #endif +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// SM90 F64 +#if (__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 8)) + #define CUTLASS_ARCH_MMA_SM90_F64_MMA_SUPPORTED 1 + #if (!defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900) + #define CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED 1 + #endif +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/arch/grid_dependency_control.h b/include/cutlass/arch/grid_dependency_control.h new file mode 100644 index 0000000000..14ef197497 --- /dev/null +++ b/include/cutlass/arch/grid_dependency_control.h @@ -0,0 +1,84 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Grid dependent control (GDC) helpers for programmatic dependent launches (PDL). +*/ + +#pragma once + +#include "cute/arch/cluster_sm90.hpp" +#include "cutlass/arch/barrier.h" +#include "cutlass/conv/dispatch_policy.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" + +#ifndef CUTLASS_GDC_ENABLED + #if (defined(CUTLASS_ENABLE_GDC_FOR_SM90) && \ + __CUDACC_VER_MAJOR__ >= 12 && \ + defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL)) + #define CUTLASS_GDC_ENABLED + #endif +#endif + +namespace cutlass { +namespace arch { + +// Issuing the launch_dependents instruction hints a dependent kernel to launch earlier +// launch_dependents doesn't impact the functionality but the performance: +// Launching a dependent kernel too early can compete with current kernels, +// while launching too late can lead to a long latency. +CUTLASS_DEVICE +void launch_dependent_grids() { +#if (defined(CUTLASS_GDC_ENABLED)) + asm volatile("griddepcontrol.launch_dependents;"); +#endif +} + +// Issuing the griddepcontrol.wait instruction enforces no global memory access +// prior to this istruction. This ensures the correctness of global memory access +// when launching a dependent kernel earlier. +CUTLASS_DEVICE +void wait_on_dependent_grids() { +#if (defined(CUTLASS_GDC_ENABLED)) + asm volatile("griddepcontrol.wait;"); +#endif +} + +// Enable kernel-level query regarding whether the GDC feature is turned on +#if (defined(CUTLASS_GDC_ENABLED)) +static constexpr bool IsGdcGloballyEnabled = true; +#else +static constexpr bool IsGdcGloballyEnabled = false; +#endif + + +} // namespace arch +} // namespace cutlass diff --git a/include/cutlass/arch/memory.h b/include/cutlass/arch/memory.h index a41110cbc1..db9ad7397c 100644 --- a/include/cutlass/arch/memory.h +++ b/include/cutlass/arch/memory.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -35,6 +35,8 @@ #pragma once #include "cutlass/cutlass.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/platform/platform.h" namespace cutlass { namespace arch { @@ -45,7 +47,9 @@ template < /// Fragment type to store loaded data typename AccessType, /// The bytes of loading - int LoadBytes + int LoadBytes, + /// Cache operation + CacheOperation::Kind cache_op = CacheOperation::Always > struct global_load; @@ -59,8 +63,7 @@ struct global_load; #if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ (__CUDACC_VER_MAJOR__ > 11)) && \ - defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) && \ - ! (defined(__clang__) && defined(__CUDA__)) + defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) #define CUTLASS_ENABLE_L2_PREFETCH 1 #else #define CUTLASS_ENABLE_L2_PREFETCH 0 @@ -69,10 +72,11 @@ struct global_load; ///////////////////////////////////////////////////////////////////////////////////////////////// // The redundant mov PTX instruction is used to enforce the compiler to -// initialize data to zero before ld.global +// keep the initializing code before ld.global template struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -108,7 +112,40 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint4 *data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %9, 0;\n" + " mov.b32 %0, %10;\n" + " mov.b32 %1, %11;\n" + " mov.b32 %2, %12;\n" + " mov.b32 %3, %13;\n" + " mov.b32 %4, %14;\n" + " mov.b32 %5, %15;\n" + " mov.b32 %6, %16;\n" + " mov.b32 %7, %17;\n" + " @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%8];\n" + " @p ld.global.lu.v4.u32 {%4, %5, %6, %7}, [%18];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w), + "=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w) + : "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y), + "r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y), + "r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -134,7 +171,31 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint4 &data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " mov.b32 %0, %6;\n" + " mov.b32 %1, %7;\n" + " mov.b32 %2, %8;\n" + " mov.b32 %3, %9;\n" + " @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n" + "}\n" + : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) + : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -159,7 +220,30 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint2 &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " mov.b32 %0, %4;\n" + " mov.b32 %1, %5;\n" + " @p ld.global.lu.v2.u32 {%0, %1}, [%2];\n" + "}\n" + : "=r"(data.x), "=r"(data.y) + : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y)); + } +}; + +template +struct global_load { CUTLASS_DEVICE 
global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -183,7 +267,29 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + unsigned &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b32 %0, %3;\n" + " @p ld.global.lu.u32 %0, [%1];\n" + "}\n" + : "=r"(data) + : "l"(ptr), "r"((int)pred_guard), "r"(data)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -207,7 +313,29 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint16_t &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b16 %0, %3;\n" + " @p ld.global.lu.u16 %0, [%1];\n" + "}\n" + : "=h"(data) + : "l"(ptr), "r"((int)pred_guard), "h"(data)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -451,7 +579,7 @@ template <> CUTLASS_DEVICE void shared_store<16>(uint32_t ptr, void const *src) { uint4 const *dst_u128 = reinterpret_cast(src); - asm volatile("ld.shared.v4.u32 [%0], {%1, %2, %3, %4};\n" + asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};\n" : : "r"(ptr), "r"(dst_u128->x), @@ -468,7 +596,7 @@ void shared_store<16>(uint32_t ptr, void const *src) { ///////////////////////////////////////////////////////////////////////////////////////////////// -#include "memory_sm75.h" -#include "memory_sm80.h" +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/memory_sm80.h" ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/memory_sm75.h b/include/cutlass/arch/memory_sm75.h index d97682ed30..0e957c72ae 100644 --- a/include/cutlass/arch/memory_sm75.h +++ b/include/cutlass/arch/memory_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -35,7 +35,10 @@ #pragma once #include "cutlass/array.h" +#include "cutlass/detail/helper_macros.hpp" #include "cutlass/layout/matrix.h" +#include "cute/arch/copy_sm75.hpp" +#include "cute/arch/util.hpp" namespace cutlass { namespace arch { @@ -48,7 +51,7 @@ template < /// .x1, .x2, or .x4 int MatrixCount > -inline __device__ void ldsm(Array & D, void const* ptr); +CUTLASS_DEVICE void ldsm(Array & D, void const* ptr); ///////////////////////////////////////////////////////////////////////////////////////////////// // @@ -56,96 +59,24 @@ inline __device__ void ldsm(Array & D, void const* ptr); // ///////////////////////////////////////////////////////////////////////////////////////////////// -#if (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) || (__CUDACC_VER_MAJOR__ >= 11) - -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) -#define CUDA_LDMATRIX_ACTIVATED 1 -#endif - -#define CUDA_LDMATRIX_SUPPORTED 1 -#endif - -///////////////////////////////////////////////////////////////////////////////////////////////// -/* -#if ! 
defined(CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED) && (__CUDACC_VER_MAJOR__ > 10) - #define CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED 1 -#endif -#if ! defined(CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED) - #define CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 1)) -#endif - -#if ! defined(CUDA_NVVM_GET_SMEM_POINTER_ENABLED) - #define CUDA_NVVM_GET_SMEM_POINTER_ENABLED CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED -#endif -*/ - -#if (! defined (__clang__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) - extern "C" { - // - // This NVVM intrinsic is subject to change in future versions of CUDA. - // Clients should not call it directly. Rather, they should use the - // cutlass::arch::ldsm<>() template. - // - __device__ uint32_t __nvvm_get_smem_pointer(void *); - } -#endif - -///////////////////////////////////////////////////////////////////////////////////////////////// - /// CUTLASS helper to get SMEM pointer -inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { - -// We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to -// the previous internal intrinsics if they are available. -#if (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11) - // - // This NVVM intrinsic converts an address in shared memory to a plain - // unsigned integer. This is necessary to pass to shared memory instructions - // in inline PTX. - // - // In CUDA 11 and beyond, this replaces __nvvm_get_smem_pointer() [only available in 10.2]. - // - //__device__ size_t __cvta_generic_to_shared(void* ptr); - - /// CUTLASS helper to get SMEM pointer - return static_cast(__cvta_generic_to_shared(ptr)); - -#elif (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) - - return __nvvm_get_smem_pointer(ptr); - -#elif defined(__CUDA_ARCH__) - - uint32_t smem_ptr; - - asm( - "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n" - : "=r"(smem_ptr) : "l"(ptr)); - - return smem_ptr; - -#else - - CUTLASS_UNUSED(ptr); - CUTLASS_NOT_IMPLEMENTED(); - return 0; - -#endif +CUTLASS_DEVICE unsigned cutlass_get_smem_pointer(void *ptr) { + return cute::cast_smem_ptr_to_uint(ptr); } - + /// CUTLASS helper to get SMEM pointer -inline __device__ unsigned cutlass_get_smem_pointer(void const *ptr) { +CUTLASS_DEVICE unsigned cutlass_get_smem_pointer(void const *ptr) { return cutlass_get_smem_pointer(const_cast(ptr)); } ///////////////////////////////////////////////////////////////////////////////////////////////// template <> -inline __device__ void ldsm( +CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { - #if defined(CUDA_LDMATRIX_ACTIVATED) + #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = cutlass_get_smem_pointer(ptr); @@ -165,11 +96,11 @@ inline __device__ void ldsm( ///////////////////////////////////////////////////////////////////////////////////////////////// template <> -inline __device__ void ldsm( +CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { - #if defined(CUDA_LDMATRIX_ACTIVATED) + #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = cutlass_get_smem_pointer(ptr); @@ -189,11 +120,11 @@ inline __device__ void ldsm( ///////////////////////////////////////////////////////////////////////////////////////////////// template <> -inline __device__ void ldsm( +CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { - #if defined(CUDA_LDMATRIX_ACTIVATED) + #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = 
cutlass_get_smem_pointer(ptr); @@ -217,11 +148,11 @@ inline __device__ void ldsm( ///////////////////////////////////////////////////////////////////////////////////////////////// template <> -inline __device__ void ldsm( +CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { - #if CUDA_LDMATRIX_ACTIVATED + #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = cutlass_get_smem_pointer(ptr); @@ -241,11 +172,11 @@ inline __device__ void ldsm( ///////////////////////////////////////////////////////////////////////////////////////////////// template <> -inline __device__ void ldsm( +CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { - #if defined(CUDA_LDMATRIX_ACTIVATED) + #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = cutlass_get_smem_pointer(ptr); @@ -265,11 +196,11 @@ inline __device__ void ldsm( ///////////////////////////////////////////////////////////////////////////////////////////////// template <> -inline __device__ void ldsm( +CUTLASS_DEVICE void ldsm( Array & D, void const* ptr) { - #if defined(CUDA_LDMATRIX_ACTIVATED) + #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED) unsigned addr = cutlass_get_smem_pointer(ptr); diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h index 48a499c5c5..cb0ba4b54b 100644 --- a/include/cutlass/arch/memory_sm80.h +++ b/include/cutlass/arch/memory_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -36,6 +36,7 @@ #pragma once #include "cutlass/cutlass.h" +#include "cutlass/complex.h" #include "cutlass/arch/memory.h" #include "cutlass/arch/memory_sm75.h" #include "cutlass/arch/cache_operation.h" @@ -53,7 +54,7 @@ namespace arch { /// Initiates an asynchronous copy from global memory to shared memory. /// -/// LDGSTS +/// cp.async /// template < /// Size of the access in bytes @@ -65,7 +66,7 @@ struct cp_async; /// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate /// the entire transfer, zeros are written to SMEM if the guard predicate is false. /// -/// LDGSTS +/// cp.async /// template < /// Size of the access in bytes @@ -77,7 +78,7 @@ struct cp_async_zfill; /// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate /// the entire transfer, nans (0x7eff) are written to SMEM if the guard predicate is false. 
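/// (Reinterpreted as an FP16 value, 0x7eff has an all-ones exponent and a non-zero mantissa with
/// its top bit set, i.e. a quiet NaN, so out-of-bounds elements read back as NaN rather than zero.)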
/// -/// LDGSTS +/// cp.async /// template < /// Size of the access in bytes @@ -89,7 +90,7 @@ struct cp_async_nan; /// Either 0 or 1 are written to SMEM based on input element type /// Used for diagonal elements of triangular matrix of BLAS3 functions /// -/// STS +/// st.shared /// template < /// Type of Element @@ -98,6 +99,9 @@ template < bool IsHermitianData = false> struct cp_async_diag; +static const uint32_t OOB_NAN_F16 = 0x7eff; +static const uint32_t OOB_NAN_F16x2 = ((OOB_NAN_F16 << 16) | OOB_NAN_F16); + //////////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization @@ -190,8 +194,8 @@ struct cp_async_nan<16, CacheOperation::Always> { cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) { #if CUDA_CP_ASYNC_ACTIVATED - static __constant__ uint4 OOB_NAN_F16x8 = {0x7eff7eff, 0x7eff7eff, - 0x7eff7eff, 0x7eff7eff}; + static __constant__ uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2, + OOB_NAN_F16x2, OOB_NAN_F16x2}; unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); @@ -305,7 +309,6 @@ struct cp_async_diag { } }; - //////////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization @@ -323,6 +326,7 @@ struct cp_async { "cp.async only supports CacheOperation::Global when access size is 16B."); unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); + cutlass::arch::synclog_emit_cp_async(__LINE__, smem_int_ptr, global_ptr, pred_guard, SizeInBytes); asm volatile( "{\n" @@ -362,6 +366,7 @@ struct cp_async_zfill { unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); int src_in_bytes = (pred_guard ? SizeInBytes : 0); + cutlass::arch::synclog_emit_cp_async_zfill(__LINE__, smem_int_ptr, global_ptr, pred_guard, SizeInBytes); asm volatile( #if CUTLASS_ENABLE_L2_PREFETCH @@ -386,6 +391,48 @@ struct cp_async_zfill { } }; +/// Partial specialization +template <> +struct cp_async_nan<16, CacheOperation::Global> { + static int const kSizeInBytes = 16; + + /// Copy with nan fill + CUTLASS_DEVICE + cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) { + #if CUDA_CP_ASYNC_ACTIVATED + + static __constant__ uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2, + OOB_NAN_F16x2, OOB_NAN_F16x2}; + + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); + cutlass::arch::synclog_emit_cp_async_nan(__LINE__, smem_int_ptr, global_ptr, pred_guard); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" +#if CUTLASS_ENABLE_L2_PREFETCH + " @p cp.async.cg.shared.global.L2::128B [%1], [%2], %3;\n" +#else + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" +#endif + " @!p st.shared.v4.u32 [%1], {%4, %5, %6, %7};\n" + "}\n" + : + : "r"((int)pred_guard), "r"(smem_int_ptr), "l"(global_ptr), + "n"(kSizeInBytes), "r"(OOB_NAN_F16x8.x), "r"(OOB_NAN_F16x8.y), "r"(OOB_NAN_F16x8.z), + "r"(OOB_NAN_F16x8.w)); + + #else + + CUTLASS_UNUSED(smem_ptr); + CUTLASS_UNUSED(global_ptr); + CUTLASS_UNUSED(pred_guard); + CUTLASS_NOT_IMPLEMENTED(); + + #endif + } +}; //////////////////////////////////////////////////////////////////////////////////////////////////// /// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block. 
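//
// Example usage (minimal sketch): the typical pipeline built from the primitives above --
// stage predicated copies, commit them as a group, then wait before consuming the data.
// `smem_ptr`, `gmem_ptr`, and `pred` are placeholder names for this illustration only.
//
//   cutlass::arch::cp_async<16>(smem_ptr, gmem_ptr, pred);  // stage a predicated 16B copy
//   cutlass::arch::cp_async_fence();                        // commit outstanding copies as a group
//   cutlass::arch::cp_async_wait<0>();                      // block until all committed groups land
//   __syncthreads();                                        // make the staged data visible CTA-wide
//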
@@ -393,6 +440,7 @@ CUTLASS_DEVICE void cp_async_fence() { #if CUDA_CP_ASYNC_ACTIVATED asm volatile("cp.async.commit_group;\n" ::); + cutlass::arch::synclog_emit_cp_async_fence(__LINE__); #endif } @@ -403,6 +451,7 @@ template CUTLASS_DEVICE void cp_async_wait() { #if CUDA_CP_ASYNC_ACTIVATED asm volatile("cp.async.wait_group %0;\n" ::"n"(N)); + cutlass::arch::synclog_emit_cp_async_wait(__LINE__, N); #endif } @@ -411,6 +460,7 @@ template <> CUTLASS_DEVICE void cp_async_wait<0>() { #if CUDA_CP_ASYNC_ACTIVATED asm volatile("cp.async.wait_all;\n" ::); + cutlass::arch::synclog_emit_cp_async_wait_all(__LINE__); #endif } diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h index ce3e02f365..007ba19bed 100644 --- a/include/cutlass/arch/mma.h +++ b/include/cutlass/arch/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -49,61 +49,85 @@ namespace arch { ///////////////////////////////////////////////////////////////////////////////////////////////// /// Tag indicating the operation implied by MMA. -struct OpMultiplyAdd; +struct OpMultiplyAdd {}; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Tag indicating the result is saturated to MAX_FLOAT|MIN_FLOAT or MAX_INT|MIN_INT -struct OpMultiplyAddSaturate; +struct OpMultiplyAddSaturate {}; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Tag indicating the input is converted to a narrower type (BF16) -struct OpMultiplyAddFastBF16; +struct OpMultiplyAddFastBF16 {}; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Tag indicating the input is converted to a narrower type (F16) -struct OpMultiplyAddFastF16; +struct OpMultiplyAddFastF16 {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tag indicating the input data types are mixed and the narrower type is +/// upcasted to the wider type +struct OpMultiplyAddMixedInputUpcast {}; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Tag indicating the input is converted to 2 (big and small) TF32 components // Perform 3xTF32 or 4xTF32 for every F32 output element -struct OpMultiplyAddFastF32; +struct OpMultiplyAddFastF32 {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// /// Tag indicating the input is converted to 2 (big and small) TF32 components // Perform 3xTF32 or 4xTF32 for every complex output element -struct OpMultiplyAddComplexFastF32; +struct OpMultiplyAddComplexFastF32 {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tag indicating that staged accumulation is not to be used. This is valid only for SM89 +/// FP8 kernels. 
+struct OpMultiplyAddFastAccum; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Tag indicating the complex multiply-add operation -struct OpMultiplyAddComplex; +struct OpMultiplyAddComplex {}; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Tag indicating the gaussian complex multiply-add operation -struct OpMultiplyAddGaussianComplex; +struct OpMultiplyAddGaussianComplex {}; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Tag indicating the inner product is defined by (XOR, POPC) -struct OpXorPopc; +struct OpXorPopc {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tag indicating the inner product is defined by (AND, POPC) +struct OpAndPopc {}; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Tag classifying math operators as thread-level operations. -struct OpClassSimt; +struct OpClassSimt {}; ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Tag classifing operators as Tensor Core operations. -struct OpClassTensorOp; +/// Tag classifying operators as Tensor Core operations. +struct OpClassTensorOp {}; ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Tag classifing operators as WMMA Tensor Core operations -struct OpClassWmmaTensorOp; +/// Tag classifying operators as WMMA Tensor Core operations +struct OpClassWmmaTensorOp {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tag classifying operators as Tensor Core with structure sparse operations. +struct OpClassSparseTensorOp {}; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -223,4 +247,23 @@ struct SparseMma; #include "cutlass/arch/mma_sm75.h" #include "cutlass/arch/mma_sm80.h" #include "cutlass/arch/mma_sparse_sm80.h" +#include "cutlass/arch/mma_sm89.h" +#include "cutlass/arch/mma_sparse_sm89.h" +#include "cutlass/arch/mma_sm90.h" +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { +namespace detail { +/// Helper for determining whether staged accumulation should be used for a given operator +template +struct UseStagedAccumulation { + static bool const value = platform::is_same::value || + platform::is_same::value || + is_sm89_staged_policy_v; +}; +} // namespace detail +} // namespace arch +} // namespace cutlass + ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/mma_sm50.h b/include/cutlass/arch/mma_sm50.h index f5458fc8be..98ff18bea0 100644 --- a/include/cutlass/arch/mma_sm50.h +++ b/include/cutlass/arch/mma_sm50.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/arch/mma_sm60.h b/include/cutlass/arch/mma_sm60.h index 6fa8b6f7c9..3e3c71ef36 100644 --- a/include/cutlass/arch/mma_sm60.h +++ b/include/cutlass/arch/mma_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/arch/mma_sm61.h b/include/cutlass/arch/mma_sm61.h index dc90d7868d..82a5aa7280 100644 --- a/include/cutlass/arch/mma_sm61.h +++ b/include/cutlass/arch/mma_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/arch/mma_sm70.h b/include/cutlass/arch/mma_sm70.h index 4fe862f4d9..28bb46382c 100644 --- a/include/cutlass/arch/mma_sm70.h +++ b/include/cutlass/arch/mma_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -33,11 +33,7 @@ */ #pragma once -#if defined(__CUDACC_RTC__) #include -#else -#include -#endif #include "mma.h" #include "cutlass/layout/matrix.h" diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h index 816072bd62..a39ededbe0 100644 --- a/include/cutlass/arch/mma_sm75.h +++ b/include/cutlass/arch/mma_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -34,11 +34,7 @@ #pragma once -#if defined(__CUDACC_RTC__) #include -#else -#include -#endif #include "cutlass/arch/wmma.h" @@ -126,7 +122,11 @@ struct Mma< : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1])); #else - assert(0); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -188,242 +188,11 @@ struct Mma< ); #else - assert(0); -#endif - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// -// Integer matrix multiply .8816 (8b) -// -//////////////////////////////////////////////////////////////////////////////// - -/// Matrix multiply-add operation: S32 = S8 * S8 + S32 -template <> -struct Mma< - gemm::GemmShape<8, 8, 16>, - 32, - int8_t, - layout::RowMajor, - int8_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<8, 8, 16>; - - using ElementA = int8_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = int8_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm75; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) - - unsigned const & A = reinterpret_cast(a); - unsigned const & B = reinterpret_cast(b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U8 * S8 + S32 -template <> -struct Mma< - gemm::GemmShape<8, 8, 16>, - 32, - uint8_t, - layout::RowMajor, - int8_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<8, 8, 16>; - - using ElementA = uint8_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = int8_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm75; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) - - unsigned const & A = reinterpret_cast(a); - unsigned const & B = reinterpret_cast(b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.u8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = S8 * U8 + S32 -template <> -struct Mma< - gemm::GemmShape<8, 8, 16>, - 32, - int8_t, - layout::RowMajor, - uint8_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<8, 8, 16>; - - using ElementA = int8_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = uint8_t; - using 
LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm75; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) - - unsigned const & A = reinterpret_cast(a); - unsigned const & B = reinterpret_cast(b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile("mma.sync.aligned.m8n8k16.row.col.s8.u8 {%0,%1}, {%2}, {%3}, {%4,%5};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U8 * U8 + S32 -template <> -struct Mma< - gemm::GemmShape<8, 8, 16>, - 32, - uint8_t, - layout::RowMajor, - uint8_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<8, 8, 16>; - - using ElementA = uint8_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = uint8_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm75; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) - - unsigned const & A = reinterpret_cast(a); - unsigned const & B = reinterpret_cast(b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.u8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - -#else - assert(0); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -437,7 +206,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = S8 * S8 + S32 template <> struct Mma< - gemm::GemmShape<8,8,16>, + gemm::GemmShape<8, 8, 16>, 32, int8_t, layout::RowMajor, @@ -447,7 +216,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,16>; + using Shape = gemm::GemmShape<8, 8, 16>; using ElementA = int8_t; using LayoutA = layout::RowMajor; @@ -484,9 +253,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else - assert(0); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -494,7 +266,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = U8 * S8 + S32 template <> struct Mma< - gemm::GemmShape<8,8,16>, + gemm::GemmShape<8, 8, 16>, 32, uint8_t, layout::RowMajor, @@ -504,7 +276,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,16>; + using Shape = gemm::GemmShape<8, 8, 16>; using ElementA = uint8_t; using LayoutA = layout::RowMajor; @@ -541,9 +313,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else - assert(0); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + 
CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -551,7 +326,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = S8 * U8 + S32 template <> struct Mma< - gemm::GemmShape<8,8,16>, + gemm::GemmShape<8, 8, 16>, 32, int8_t, layout::RowMajor, @@ -561,7 +336,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,16>; + using Shape = gemm::GemmShape<8, 8, 16>; using ElementA = int8_t; using LayoutA = layout::RowMajor; @@ -598,9 +373,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else - assert(0); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -608,7 +386,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = U8 * U8 + S32 template <> struct Mma< - gemm::GemmShape<8,8,16>, + gemm::GemmShape<8, 8, 16>, 32, uint8_t, layout::RowMajor, @@ -618,7 +396,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,16>; + using Shape = gemm::GemmShape<8, 8, 16>; using ElementA = uint8_t; using LayoutA = layout::RowMajor; @@ -655,243 +433,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - -#else - assert(0); -#endif - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// -// Integer matrix multiply (4b) -// -//////////////////////////////////////////////////////////////////////////////// - -/// Matrix multiply-add operation: S32 = S4 * S4 + S32 -template <> -struct Mma< - gemm::GemmShape<8,8,32>, - 32, - int4b_t, - layout::RowMajor, - int4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<8,8,32>; - - using ElementA = int4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = int4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm75; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) - - unsigned const & A = reinterpret_cast(a); - unsigned const & B = reinterpret_cast(b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U4 * S4 + S32 -template <> -struct Mma< - gemm::GemmShape<8,8,32>, - 32, - uint4b_t, - layout::RowMajor, - int4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<8,8,32>; - - using ElementA = uint4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = int4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm75; - - /// Computes multiply-add 
- CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) - - unsigned const & A = reinterpret_cast(a); - unsigned const & B = reinterpret_cast(b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.u4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = S4 * U4 + S32 -template <> -struct Mma< - gemm::GemmShape<8,8,32>, - 32, - int4b_t, - layout::RowMajor, - uint4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<8,8,32>; - - using ElementA = int4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = uint4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm75; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) - - unsigned const & A = reinterpret_cast(a); - unsigned const & B = reinterpret_cast(b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U4 * U4 + S32 -template <> -struct Mma< - gemm::GemmShape<8,8,32>, - 32, - uint4b_t, - layout::RowMajor, - uint4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<8,8,32>; - - using ElementA = uint4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = uint4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm75; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) - - unsigned const & A = reinterpret_cast(a); - unsigned const & B = reinterpret_cast(b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.u4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else - assert(0); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -905,7 +452,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = S4 * S4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, int4b_t, layout::RowMajor, @@ -915,7 +462,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 32>; using ElementA = int4b_t; using LayoutA = layout::RowMajor; @@ -952,9 +499,12 @@ struct Mma< asm 
volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else - assert(0); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -962,7 +512,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = U4 * S4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, uint4b_t, layout::RowMajor, @@ -972,7 +522,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 32>; using ElementA = uint4b_t; using LayoutA = layout::RowMajor; @@ -1009,9 +559,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else - assert(0); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -1019,7 +572,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = S4 * U4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, int4b_t, layout::RowMajor, @@ -1029,7 +582,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 32>; using ElementA = int4b_t; using LayoutA = layout::RowMajor; @@ -1066,9 +619,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else - assert(0); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -1076,7 +632,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = U4 * U4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, uint4b_t, layout::RowMajor, @@ -1086,7 +642,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 32>; using ElementA = uint4b_t; using LayoutA = layout::RowMajor; @@ -1123,9 +679,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else - assert(0); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -1176,6 +735,7 @@ struct Mma< ) const { #if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + #if defined(CUTLASS_ARCH_WMMA_ENABLED) using WmmaFragmentA = nvcuda::wmma::fragment< nvcuda::wmma::matrix_a, @@ -1208,16 +768,18 @@ struct Mma< nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR, nvcuda::wmma::experimental::bmmaAccumulateOpPOPC); + #else - assert(0); // WMMA must be supported to issue binary matrix multiply-accumulate instructions. + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); // WMMA must be supported to issue binary matrix multiply-accumulate instructions. 
#endif // defined(CUTLASS_ARCH_WMMA_ENABLED) -#else - assert(0); #endif - } }; diff --git a/include/cutlass/arch/mma_sm80.h b/include/cutlass/arch/mma_sm80.h index 5b9f524067..19d78bf20e 100644 --- a/include/cutlass/arch/mma_sm80.h +++ b/include/cutlass/arch/mma_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -34,11 +34,7 @@ #pragma once -#if defined(__CUDACC_RTC__) #include -#else -#include -#endif #include "cutlass/cutlass.h" #include "mma.h" @@ -53,7 +49,16 @@ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) #define CUTLASS_ARCH_MMA_SM80_ENABLED + +#if (__CUDA_ARCH__ <= 900) +#define CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED +#endif +#if (__CUDA_ARCH__ <= 890) +#define CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED #endif + +#endif + #endif //////////////////////////////////////////////////////////////////////////////// @@ -528,7 +533,7 @@ struct Mma< //////////////////////////////////////////////////////////////////////////////// // -// Matrix Multiply 16816 - S8 input, S32 accumulation +// Matrix Multiply 16816 - S8 input, S32 accumulation - SATURATE // //////////////////////////////////////////////////////////////////////////////// @@ -543,7 +548,7 @@ struct Mma< layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd> { + OpMultiplyAddSaturate> { using Shape = gemm::GemmShape<16,8,16>; @@ -559,8 +564,7 @@ struct Mma< using LayoutC = layout::RowMajor; using FragmentC = Array; - using Operator = OpMultiplyAdd; - + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; /// Computes multiply-add @@ -573,6 +577,7 @@ struct Mma< ) const { #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + uint32_t const *A = reinterpret_cast(&a); uint32_t const &B = reinterpret_cast(b); @@ -580,8 +585,8 @@ struct Mma< int *D = reinterpret_cast(&d); asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, " - "{%7,%8,%9,%10};\n" + "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); @@ -603,7 +608,7 @@ struct Mma< layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd> { + OpMultiplyAddSaturate> { using Shape = gemm::GemmShape<16,8,16>; @@ -619,7 +624,7 @@ struct Mma< using LayoutC = layout::RowMajor; using FragmentC = Array; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; /// Computes multiply-add @@ -632,6 +637,7 @@ struct Mma< ) const { #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + uint32_t const *A = reinterpret_cast(&a); uint32_t const &B = reinterpret_cast(b); @@ -639,8 +645,8 @@ struct Mma< int *D = reinterpret_cast(&d); asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, " - "{%7,%8,%9,%10};\n" + "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); @@ -662,7 +668,7 @@ struct Mma< layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd> { 
+ OpMultiplyAddSaturate> { using Shape = gemm::GemmShape<16,8,16>; @@ -678,7 +684,7 @@ struct Mma< using LayoutC = layout::RowMajor; using FragmentC = Array; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; /// Computes multiply-add @@ -699,12 +705,12 @@ struct Mma< int *D = reinterpret_cast(&d); asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, " - "{%7,%8,%9,%10};\n" + "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - + #else assert(0); #endif @@ -722,7 +728,7 @@ struct Mma< layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd> { + OpMultiplyAddSaturate> { using Shape = gemm::GemmShape<16,8,16>; @@ -738,7 +744,7 @@ struct Mma< using LayoutC = layout::RowMajor; using FragmentC = Array; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; /// Computes multiply-add @@ -759,13 +765,12 @@ struct Mma< int *D = reinterpret_cast(&d); asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, " - "{%7,%8,%9,%10};\n" + "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - #else assert(0); #endif @@ -774,14 +779,14 @@ struct Mma< //////////////////////////////////////////////////////////////////////////////// // -// Matrix Multiply 16816 - S8 input, S32 accumulation - SATURATE +// Matrix Multiply 16832 - S8 input, S32 accumulation - SATURATE // //////////////////////////////////////////////////////////////////////////////// /// Matrix multiply-add operation: S32 = S8 * S8 + S32 template <> struct Mma< - gemm::GemmShape<16,8,16>, + gemm::GemmShape<16,8,32>, 32, int8_t, layout::RowMajor, @@ -791,15 +796,15 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<16,8,16>; + using Shape = gemm::GemmShape<16,8,32>; using ElementA = int8_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; using ElementB = int8_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC = int; using LayoutC = layout::RowMajor; @@ -819,18 +824,18 @@ struct Mma< #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - uint32_t const *A = reinterpret_cast(&a); - uint32_t const &B = reinterpret_cast(b); + uint32_t const * A = reinterpret_cast(&a); + uint32_t const * B = reinterpret_cast(&b); - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " - "{%6}, {%7,%8,%9,%10};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), - "r"(C[3])); + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); #else assert(0); @@ -841,7 +846,7 @@ struct Mma< /// Matrix 
multiply-add operation: S32 = U8 * S8 + S32 template <> struct Mma< - gemm::GemmShape<16,8,16>, + gemm::GemmShape<16,8,32>, 32, uint8_t, layout::RowMajor, @@ -851,15 +856,15 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<16,8,16>; + using Shape = gemm::GemmShape<16,8,32>; using ElementA = uint8_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; using ElementB = int8_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC = int; using LayoutC = layout::RowMajor; @@ -880,17 +885,17 @@ struct Mma< #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) uint32_t const *A = reinterpret_cast(&a); - uint32_t const &B = reinterpret_cast(b); + uint32_t const *B = reinterpret_cast(&b); int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " - "{%6}, {%7,%8,%9,%10};\n" + "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), - "r"(C[3])); + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); #else assert(0); @@ -901,7 +906,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = S8 * U8 + S32 template <> struct Mma< - gemm::GemmShape<16,8,16>, + gemm::GemmShape<16,8,32>, 32, int8_t, layout::RowMajor, @@ -911,15 +916,15 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<16,8,16>; + using Shape = gemm::GemmShape<16,8,32>; using ElementA = int8_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; using ElementB = uint8_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC = int; using LayoutC = layout::RowMajor; @@ -940,18 +945,18 @@ struct Mma< #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) uint32_t const *A = reinterpret_cast(&a); - uint32_t const &B = reinterpret_cast(b); + uint32_t const *B = reinterpret_cast(&b); int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " - "{%6}, {%7,%8,%9,%10};\n" + "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), - "r"(C[3])); - + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + #else assert(0); #endif @@ -961,7 +966,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = U8 * U8 + S32 template <> struct Mma< - gemm::GemmShape<16,8,16>, + gemm::GemmShape<16,8,32>, 32, uint8_t, layout::RowMajor, @@ -971,15 +976,15 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<16,8,16>; + using Shape = gemm::GemmShape<16,8,32>; using ElementA = uint8_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; using ElementB = uint8_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC = int; using LayoutC = layout::RowMajor; @@ -1000,17 +1005,17 @@ struct Mma< #if 
defined(CUTLASS_ARCH_MMA_SM80_ENABLED) uint32_t const *A = reinterpret_cast(&a); - uint32_t const &B = reinterpret_cast(b); + uint32_t const *B = reinterpret_cast(&b); int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " - "{%6}, {%7,%8,%9,%10};\n" + "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), - "r"(C[3])); + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); #else assert(0); @@ -1020,38 +1025,38 @@ struct Mma< //////////////////////////////////////////////////////////////////////////////// // -// Matrix Multiply 16832 - S8 input, S32 accumulation +// Matrix Multiply 16864 - S4 input, S32 accumulation - SATURATE // //////////////////////////////////////////////////////////////////////////////// -/// Matrix multiply-add operation: S32 = S8 * S8 + S32 +/// Matrix multiply-add operation: S32 = S4 * S4 + S32 template <> struct Mma< - gemm::GemmShape<16,8,32>, + gemm::GemmShape<16, 8, 64>, 32, - int8_t, + cutlass::int4b_t, layout::RowMajor, - int8_t, + cutlass::int4b_t, layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd> { + OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<16,8,32>; + using Shape = gemm::GemmShape<16, 8, 64>; - using ElementA = int8_t; + using ElementA = cutlass::int4b_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; - using ElementB = int8_t; + using ElementB = cutlass::int4b_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC = int; using LayoutC = layout::RowMajor; using FragmentC = Array; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; /// Computes multiply-add @@ -1065,53 +1070,57 @@ struct Mma< #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); + uint32_t const * A = reinterpret_cast(&a); + uint32_t const * B = reinterpret_cast(&b); - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } }; -/// Matrix multiply-add operation: S32 = U8 * S8 + S32 +/// Matrix multiply-add operation: S32 = U4 * S4 + S32 template <> struct Mma< - gemm::GemmShape<16,8,32>, + gemm::GemmShape<16, 8, 64>, 32, - uint8_t, + cutlass::uint4b_t, layout::RowMajor, - int8_t, + cutlass::int4b_t, layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd> { + OpMultiplyAddSaturate> { - using 
Shape = gemm::GemmShape<16,8,32>; + using Shape = gemm::GemmShape<16, 8, 64>; - using ElementA = uint8_t; + using ElementA = cutlass::uint4b_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; - using ElementB = int8_t; + using ElementB = cutlass::int4b_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC = int; using LayoutC = layout::RowMajor; using FragmentC = Array; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; /// Computes multiply-add @@ -1132,46 +1141,50 @@ struct Mma< int *D = reinterpret_cast(&d); asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9}, {%10,%11,%12,%13};\n" + "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } }; -/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +/// Matrix multiply-add operation: S32 = S4 * U4 + S32 template <> struct Mma< - gemm::GemmShape<16,8,32>, + gemm::GemmShape<16, 8, 64>, 32, - int8_t, + cutlass::int4b_t, layout::RowMajor, - uint8_t, + cutlass::uint4b_t, layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd> { + OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<16,8,32>; + using Shape = gemm::GemmShape<16, 8, 64>; - using ElementA = int8_t; + using ElementA = cutlass::int4b_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; - using ElementB = uint8_t; + using ElementB = cutlass::uint4b_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC = int; using LayoutC = layout::RowMajor; using FragmentC = Array; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; /// Computes multiply-add @@ -1192,46 +1205,50 @@ struct Mma< int *D = reinterpret_cast(&d); asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9}, {%10,%11,%12,%13};\n" + "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } }; -/// Matrix multiply-add operation: S32 = U8 * U8 + S32 +/// Matrix multiply-add operation: S32 = U4 * U4 + S32 template <> struct Mma< - gemm::GemmShape<16,8,32>, + gemm::GemmShape<16, 8, 64>, 32, - uint8_t, + cutlass::uint4b_t, layout::RowMajor, - uint8_t, + cutlass::uint4b_t, layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd> { + OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<16,8,32>; + using Shape = gemm::GemmShape<16, 8, 64>; - using ElementA = uint8_t; + using ElementA = cutlass::uint4b_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; - using ElementB = uint8_t; + using ElementB = cutlass::uint4b_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC 
= int; using LayoutC = layout::RowMajor; using FragmentC = Array; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; /// Computes multiply-add @@ -1252,725 +1269,56 @@ struct Mma< int *D = reinterpret_cast(&d); asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - -#else - assert(0); -#endif - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// -// Matrix Multiply 16832 - S8 input, S32 accumulation - SATURATE -// -//////////////////////////////////////////////////////////////////////////////// - -/// Matrix multiply-add operation: S32 = S8 * S8 + S32 -template <> -struct Mma< - gemm::GemmShape<16,8,32>, - 32, - int8_t, - layout::RowMajor, - int8_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAddSaturate> { - - using Shape = gemm::GemmShape<16,8,32>; - - using ElementA = int8_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = int8_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - - uint32_t const * A = reinterpret_cast(&a); - uint32_t const * B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, " - "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U8 * S8 + S32 -template <> -struct Mma< - gemm::GemmShape<16,8,32>, - 32, - uint8_t, - layout::RowMajor, - int8_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAddSaturate> { - - using Shape = gemm::GemmShape<16,8,32>; - - using ElementA = uint8_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = int8_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAddSaturate; - using ArchTag = arch::Sm80; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, " - "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - -#else - assert(0); -#endif - } -}; - -/// 
Matrix multiply-add operation: S32 = S8 * U8 + S32 -template <> -struct Mma< - gemm::GemmShape<16,8,32>, - 32, - int8_t, - layout::RowMajor, - uint8_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAddSaturate> { - - using Shape = gemm::GemmShape<16,8,32>; - - using ElementA = int8_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = uint8_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, " - "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U8 * U8 + S32 -template <> -struct Mma< - gemm::GemmShape<16,8,32>, - 32, - uint8_t, - layout::RowMajor, - uint8_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAddSaturate> { - - using Shape = gemm::GemmShape<16,8,32>; - - using ElementA = uint8_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = uint8_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAddSaturate; - using ArchTag = arch::Sm80; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, " - "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - -#else - assert(0); -#endif - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// -// Matrix Multiply 16864 - S4 input, S32 accumulation -// -//////////////////////////////////////////////////////////////////////////////// - -/// Matrix multiply-add operation: S32 = S4 * S4 + S32 -template <> -struct Mma< - gemm::GemmShape<16, 8, 64>, - 32, - cutlass::int4b_t, - layout::RowMajor, - cutlass::int4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<16, 8, 64>; - - using ElementA = cutlass::int4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::int4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; 
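// Illustrative sketch, not part of this patch: what the .satfinite qualifier used by the
// specializations in this hunk changes. A saturating S32 multiply-accumulate clamps to
// [INT32_MIN, INT32_MAX] instead of wrapping on overflow. A host-side analogue, assuming
// the intermediate sum is formed at wider precision:
#include <algorithm>
#include <cstdint>

int32_t multiply_add_satfinite(int32_t acc, int8_t a, int8_t b) {
  int64_t wide = static_cast<int64_t>(acc) +
                 static_cast<int64_t>(a) * static_cast<int64_t>(b);
  wide = std::min<int64_t>(std::max<int64_t>(wide, INT32_MIN), INT32_MAX);
  return static_cast<int32_t>(wide);  // e.g. acc = INT32_MAX with a = b = 127 stays at INT32_MAX
}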
- using ArchTag = arch::Sm80; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile( - "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U4 * S4 + S32 -template <> -struct Mma< - gemm::GemmShape<16, 8, 64>, - 32, - cutlass::uint4b_t, - layout::RowMajor, - cutlass::int4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<16, 8, 64>; - - using ElementA = cutlass::uint4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::int4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile( - "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = S4 * U4 + S32 -template <> -struct Mma< - gemm::GemmShape<16, 8, 64>, - 32, - cutlass::int4b_t, - layout::RowMajor, - cutlass::uint4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<16, 8, 64>; - - using ElementA = cutlass::int4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::uint4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile( - "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U4 * U4 + S32 
-template <> -struct Mma< - gemm::GemmShape<16, 8, 64>, - 32, - cutlass::uint4b_t, - layout::RowMajor, - cutlass::uint4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd> { - - using Shape = gemm::GemmShape<16, 8, 64>; - - using ElementA = cutlass::uint4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::uint4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile( - "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9}, {%10,%11,%12,%13};\n" + "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } }; - //////////////////////////////////////////////////////////////////////////////// // -// Matrix Multiply 16864 - S4 input, S32 accumulation - SATURATE +// Matrix Multiply 168256 - B1 input, S32 accumulation - AND,POPC // //////////////////////////////////////////////////////////////////////////////// -/// Matrix multiply-add operation: S32 = S4 * S4 + S32 -template <> -struct Mma< - gemm::GemmShape<16, 8, 64>, - 32, - cutlass::int4b_t, - layout::RowMajor, - cutlass::int4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAddSaturate> { - - using Shape = gemm::GemmShape<16, 8, 64>; - - using ElementA = cutlass::int4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::int4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - - uint32_t const * A = reinterpret_cast(&a); - uint32_t const * B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile( - "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, " - "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U4 * S4 + S32 -template <> -struct Mma< - gemm::GemmShape<16, 8, 64>, - 32, - cutlass::uint4b_t, - layout::RowMajor, - cutlass::int4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAddSaturate> { - - using Shape = gemm::GemmShape<16, 8, 64>; - - using ElementA = cutlass::uint4b_t; - 
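// Illustrative sketch, not part of this patch: every warp-wide mma.sync specialization in
// this file sizes its per-thread fragments as (operand tile elements) / 32. Spot-checking
// the m16n8k64 INT4 saturating atom that supersedes the block being removed here:
static_assert(cutlass::arch::Mma<
                  cutlass::gemm::GemmShape<16, 8, 64>, 32,
                  cutlass::int4b_t, cutlass::layout::RowMajor,
                  cutlass::int4b_t, cutlass::layout::ColumnMajor,
                  int, cutlass::layout::RowMajor,
                  cutlass::arch::OpMultiplyAddSaturate>::FragmentA::kElements == (16 * 64) / 32,
              "each thread holds 32 packed int4b values of A");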
using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::int4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAddSaturate; - using ArchTag = arch::Sm80; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile( - "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, " - "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = S4 * U4 + S32 -template <> -struct Mma< - gemm::GemmShape<16, 8, 64>, - 32, - cutlass::int4b_t, - layout::RowMajor, - cutlass::uint4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAddSaturate> { - - using Shape = gemm::GemmShape<16, 8, 64>; - - using ElementA = cutlass::int4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::uint4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c - ) const { - -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - asm volatile( - "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, " - "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); - -#else - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U4 * U4 + S32 +/// Matrix multiply-add operation: S32 = B1 & B1 + S32 template <> struct Mma< - gemm::GemmShape<16, 8, 64>, + gemm::GemmShape<16,8,256>, 32, - cutlass::uint4b_t, + cutlass::uint1b_t, layout::RowMajor, - cutlass::uint4b_t, + cutlass::uint1b_t, layout::ColumnMajor, - int, + int32_t, layout::RowMajor, - OpMultiplyAddSaturate> { + OpAndPopc> { - using Shape = gemm::GemmShape<16, 8, 64>; + using Shape = gemm::GemmShape<16,8,256>; - using ElementA = cutlass::uint4b_t; + using ElementA = cutlass::uint1b_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; - using ElementB = cutlass::uint4b_t; + using ElementB = cutlass::uint1b_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; - using ElementC = int; + using ElementC = int32_t; using LayoutC = layout::RowMajor; - using FragmentC = Array; + using FragmentC = Array; - using Operator = OpMultiplyAddSaturate; + using Operator = OpAndPopc; using ArchTag = arch::Sm80; /// Computes multiply-add @@ -1982,7 +1330,7 @@ 
struct Mma< FragmentC const &c ) const { -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) +#if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED) uint32_t const *A = reinterpret_cast(&a); uint32_t const *B = reinterpret_cast(&b); @@ -1991,13 +1339,18 @@ struct Mma< int *D = reinterpret_cast(&d); asm volatile( - "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, " - "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } @@ -2042,7 +1395,7 @@ struct Mma< FragmentC const &c ) const { -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) +#if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED) uint32_t const *A = reinterpret_cast(&a); uint32_t const *B = reinterpret_cast(&b); @@ -2059,6 +1412,10 @@ struct Mma< "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } @@ -2109,13 +1466,14 @@ struct Mma< FragmentC const &c ) const { -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) +#if defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED) uint32_t const *A = reinterpret_cast(&a); uint32_t const *B = reinterpret_cast(&b); int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); + asm volatile( "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc {%0,%1,%2,%3}, " "{%4,%5,%6,%7}, " @@ -2126,9 +1484,13 @@ struct Mma< #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); - -#endif // defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + +#endif // defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED) } }; @@ -2136,5 +1498,4 @@ struct Mma< } // namespace arch } // namespace cutlass - ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/mma_sm89.h b/include/cutlass/arch/mma_sm89.h new file mode 100644 index 0000000000..d8a75b6623 --- /dev/null +++ b/include/cutlass/arch/mma_sm89.h @@ -0,0 +1,363 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Matrix multiply-accumulate specialzied for SM89 +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4) + +# define CUTLASS_ARCH_MMA_SM89_SUPPORTED 1 +#endif + +#if defined(CUTLASS_ARCH_MMA_SM89_SUPPORTED) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 890) +# define CUTLASS_ARCH_MMA_SM89_ENABLED +#endif + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +// Whether the Mma uses as SM89 staged accumulation policy +template +static constexpr bool is_sm89_staged_policy_v = + ( + // ElementA must be FP8 + platform::is_same::value || + platform::is_same::value + ) && + ( + // ElementB must be FP8 + platform::is_same::value || + platform::is_same::value + ) && + ( + // The instruction shape must be 16x8x32 + Operator::ArchMmaOperator::Shape::kM == 16 && + Operator::ArchMmaOperator::Shape::kN == 8 && + Operator::ArchMmaOperator::Shape::kK == 32 + ) && + ( + // The operator must be OpMultiplyAdd (default) + platform::is_same::value + ); +} // namespace detail + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16832 - Float {E4M3, E5M2}, FP32 accumulation +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation - F32 = fe4m3 * fe4m3 + F32 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e4m3_t, + layout::RowMajor, + cutlass::float_e4m3_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e4m3_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e4m3_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + 
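// Illustrative sketch, not part of this patch: driving the SM89 FP8 atom declared above from
// thread-level code. Fragment widths follow the specialization (16 e4m3 values of A, 8 of B,
// 4 F32 accumulators per thread). The function name is hypothetical; it assumes compilation
// for sm_89 with CUDA 12.4+ (the guards above) and fragments already distributed across the
// warp in the layout mma.sync expects.
__device__ void fp8_mma_sketch(
    cutlass::Array<cutlass::float_e4m3_t, 16> const &a,
    cutlass::Array<cutlass::float_e4m3_t, 8> const &b,
    cutlass::Array<float, 4> &acc) {
  using MmaOp = cutlass::arch::Mma<
      cutlass::gemm::GemmShape<16, 8, 32>, 32,
      cutlass::float_e4m3_t, cutlass::layout::RowMajor,
      cutlass::float_e4m3_t, cutlass::layout::ColumnMajor,
      float, cutlass::layout::RowMajor,
      cutlass::arch::OpMultiplyAdd>;
  MmaOp mma;
  mma(acc, a, b, acc);  // d = a * b + c with the accumulator reused as c
}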
+ asm( + "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +/// Matrix multiply-add operation - F32 = fe4m3 * fe5m2 + F32 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e4m3_t, + layout::RowMajor, + cutlass::float_e5m2_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e4m3_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e5m2_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e5m2.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +/// Matrix multiply-add operation - F32 = fe5m2 * fe4m3 + F32 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e5m2_t, + layout::RowMajor, + cutlass::float_e4m3_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e5m2_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e4m3_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k32.row.col.f32.e5m2.e4m3.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + 
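// Illustrative sketch, not part of this patch: the two 8-bit float formats these mixed
// specializations combine. e4m3 favors precision (3 mantissa bits, max finite 448);
// e5m2 favors range (2 mantissa bits, max finite 57344). Under round-to-nearest the
// coarser spacing of e5m2 shows up immediately:
#include "cutlass/numeric_types.h"

float roundtrip_e4m3(float x) { return float(cutlass::float_e4m3_t(x)); }  // 1.09f -> 1.125f
float roundtrip_e5m2(float x) { return float(cutlass::float_e5m2_t(x)); }  // 1.09f -> 1.0f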
CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +/// Matrix multiply-add operation - F32 = fe5m2 * fe5m2 + F32 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e5m2_t, + layout::RowMajor, + cutlass::float_e5m2_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e5m2_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e5m2_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k32.row.col.f32.e5m2.e5m2.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +} // namespace arch +} // namespace cutlass diff --git a/include/cutlass/arch/mma_sm90.h b/include/cutlass/arch/mma_sm90.h new file mode 100644 index 0000000000..16108f0a1e --- /dev/null +++ b/include/cutlass/arch/mma_sm90.h @@ -0,0 +1,241 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Matrix multiply +*/ + +#pragma once + +#include + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/config.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////// +/// Matrix Multiply-Add 16x8x4 fp64 +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F64 = F64 * F64 + F64 +template <> +struct Mma< + gemm::GemmShape<16,8,4>, + 32, + double, + layout::RowMajor, + double, + layout::ColumnMajor, + double, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,4>; + + using ElementA = double; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = double; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + + using ArchTag = arch::Sm90; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED) + + double const *A = reinterpret_cast(&a); + double const *B = reinterpret_cast(&b); + + double const *C = reinterpret_cast(&c); + double *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64.rn {%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" + : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3]) + : "d"(A[0]), "d"(A[1]), + "d"(B[0]), + "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3])); + +#else + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Matrix Multiply-Add 16x8x8 fp64 +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F64 = F64 * F64 + F64 +template <> +struct Mma< + gemm::GemmShape<16,8,8>, + 32, + double, + layout::RowMajor, + double, + layout::ColumnMajor, + double, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,8>; + + using ElementA = double; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = double; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + + using ArchTag = arch::Sm90; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED) + + double 
const *A = reinterpret_cast<double const *>(&a);
+    double const *B = reinterpret_cast<double const *>(&b);
+
+    double const *C = reinterpret_cast<double const *>(&c);
+    double *D = reinterpret_cast<double *>(&d);
+
+    asm volatile("mma.sync.aligned.m16n8k8.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n"
+      : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
+      : "d"(A[0]), "d"(A[1]), "d"(A[2]), "d"(A[3]),
+        "d"(B[0]), "d"(B[1]),
+        "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
+
+#else
+
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Matrix Multiply-Add 16x8x16 fp64
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F64 = F64 * F64 + F64
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,16>,
+  32,
+  double,
+  layout::RowMajor,
+  double,
+  layout::ColumnMajor,
+  double,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16,8,16>;
+
+  using ElementA = double;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<double, 8>;
+
+  using ElementB = double;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<double, 4>;
+
+  using ElementC = double;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<double, 4>;
+
+  using Operator = OpMultiplyAdd;
+
+  using ArchTag = arch::Sm90;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
+
+    double const *A = reinterpret_cast<double const *>(&a);
+    double const *B = reinterpret_cast<double const *>(&b);
+
+    double const *C = reinterpret_cast<double const *>(&c);
+    double *D = reinterpret_cast<double *>(&d);
+
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7, %8, %9, %10, %11}, {%12, %13, %14, %15}, {%16, %17, %18, %19};\n"
+      : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
+      : "d"(A[0]), "d"(A[1]), "d"(A[2]), "d"(A[3]), "d"(A[4]), "d"(A[5]), "d"(A[6]), "d"(A[7]),
+        "d"(B[0]), "d"(B[1]), "d"(B[2]), "d"(B[3]),
+        "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
+
+#else
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/arch/mma_sparse_sm80.h b/include/cutlass/arch/mma_sparse_sm80.h
index e22d600623..ed2a5ad019 100644
--- a/include/cutlass/arch/mma_sparse_sm80.h
+++ b/include/cutlass/arch/mma_sparse_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -35,11 +35,7 @@ #pragma once -#if defined(__CUDACC_RTC__) #include -#else -#include -#endif #include "mma.h" #include "cutlass/layout/matrix.h" @@ -54,6 +50,7 @@ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) #define CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED #endif + #endif ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -121,6 +118,27 @@ struct SparseMma< uint32_t const *C = reinterpret_cast(&c); uint32_t *D = reinterpret_cast(&d); +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, " + "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E)); + } + else if (id2 == 1) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, " + "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x1;\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E)); + } + else { + assert(0); + } +#else if (id2 == 0) { asm volatile( "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, " @@ -140,7 +158,13 @@ struct SparseMma< else { assert(0); } +#endif + #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } @@ -200,6 +224,29 @@ struct SparseMma< float const *C = reinterpret_cast(&c); float *D = reinterpret_cast(&d); +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), + "r"(E)); + } + else if (id2 == 1) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), + "r"(E)); + } + else { + assert(0); + } +#else if (id2 == 0) { asm volatile( "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " @@ -222,8 +269,13 @@ struct SparseMma< assert(0); } -#else +#endif +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } @@ -276,26 +328,50 @@ struct SparseMma, 32, bfloat16_t, layout::RowMajor, float const *C = reinterpret_cast(&c); float *D = reinterpret_cast(&d); +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) if (id2 == 0) { - asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
- "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); } else if (id2 == 1) { - asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" - : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); } else { - assert(0); + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else if (id2 == 1) { + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else { + assert(0); } +#endif #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } @@ -348,26 +424,50 @@ struct SparseMma, 32, tfloat32_t, layout::RowMajor, float const *C = reinterpret_cast(&c); float *D = reinterpret_cast(&d); +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) if (id2 == 0) { - asm volatile( - "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); } else if (id2 == 1) { - asm volatile( - "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" - : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + asm volatile( + 
"mma.sp::ordered_metadata.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); } else { - assert(0); + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else if (id2 == 1) { + asm volatile( + "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else { + assert(0); } +#endif #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } @@ -375,7 +475,7 @@ struct SparseMma, 32, tfloat32_t, layout::RowMajor, //////////////////////////////////////////////////////////////////////////////// // -// Sparse Matrix Multiply 16864 - S8 input, S32 accumulation +// Sparse Matrix Multiply 16864 - S8 input, S32 accumulation - SATURATE // //////////////////////////////////////////////////////////////////////////////// @@ -390,7 +490,7 @@ struct SparseMma< layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd, + OpMultiplyAddSaturate, SPFormatType::Thread> { using Shape = gemm::GemmShape<16,8,64>; @@ -409,7 +509,7 @@ struct SparseMma< using FragmentE = uint32_t; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; static int const kSparse = 2; @@ -437,18 +537,35 @@ struct SparseMma< int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); - +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } #else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); 
+ CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } @@ -465,7 +582,7 @@ struct SparseMma< layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd, + OpMultiplyAddSaturate, SPFormatType::Thread> { using Shape = gemm::GemmShape<16,8,64>; @@ -484,7 +601,7 @@ struct SparseMma< using FragmentE = uint32_t; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; static int const kSparse = 2; @@ -512,18 +629,36 @@ struct SparseMma< int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } @@ -540,7 +675,7 @@ struct SparseMma< layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd, + OpMultiplyAddSaturate, SPFormatType::Thread> { using Shape = gemm::GemmShape<16,8,64>; @@ -559,7 +694,7 @@ struct SparseMma< using FragmentE = uint32_t; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; static int const kSparse = 2; @@ -587,18 +722,35 @@ struct SparseMma< int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); - +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } #else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + 
"{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } @@ -615,7 +767,7 @@ struct SparseMma< layout::ColumnMajor, int, layout::RowMajor, - OpMultiplyAdd, + OpMultiplyAddSaturate, SPFormatType::Thread> { using Shape = gemm::GemmShape<16,8,64>; @@ -634,7 +786,7 @@ struct SparseMma< using FragmentE = uint32_t; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; static int const kSparse = 2; @@ -662,18 +814,35 @@ struct SparseMma< int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); - +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } #else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } @@ -681,33 +850,33 @@ struct SparseMma< //////////////////////////////////////////////////////////////////////////////// // -// Sparse Matrix Multiply 16864 - S8 input, S32 accumulation - SATURATE +// Sparse Matrix Multiply 168128 - S4 input, S32 accumulation - SATURATE // //////////////////////////////////////////////////////////////////////////////// -/// Matrix multiply-add operation: S32 = S8 * S8 + S32 +/// Matrix multiply-add operation: S32 = S4 * S4 + S32 template <> struct SparseMma< - gemm::GemmShape<16,8,64>, + gemm::GemmShape<16,8,128>, 32, - int8_t, + cutlass::int4b_t, layout::RowMajor, - int8_t, + cutlass::int4b_t, layout::ColumnMajor, int, layout::RowMajor, OpMultiplyAddSaturate, SPFormatType::Thread> { - using Shape = gemm::GemmShape<16,8,64>; + using Shape = gemm::GemmShape<16,8,128>; - using ElementA = int8_t; + using ElementA = cutlass::int4b_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; - using ElementB = int8_t; + using ElementB = cutlass::int4b_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC = int; using LayoutC = layout::RowMajor; @@ -715,7 +884,7 @@ struct SparseMma< using 
FragmentE = uint32_t; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; static int const kSparse = 2; @@ -743,46 +912,64 @@ struct SparseMma< int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } }; -/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +/// Matrix multiply-add operation: S32 = S4 * U4 + S32 template <> struct SparseMma< - gemm::GemmShape<16,8,64>, + gemm::GemmShape<16,8,128>, 32, - int8_t, + cutlass::int4b_t, layout::RowMajor, - uint8_t, + cutlass::uint4b_t, layout::ColumnMajor, int, layout::RowMajor, OpMultiplyAddSaturate, SPFormatType::Thread> { - using Shape = gemm::GemmShape<16,8,64>; + using Shape = gemm::GemmShape<16,8,128>; - using ElementA = int8_t; + using ElementA = cutlass::int4b_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; - using ElementB = uint8_t; + using ElementB = cutlass::uint4b_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC = int; using LayoutC = layout::RowMajor; @@ -790,7 +977,7 @@ struct SparseMma< using FragmentE = uint32_t; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; static int const kSparse = 2; @@ -818,46 +1005,64 @@ struct SparseMma< int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : 
"=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } }; -/// Matrix multiply-add operation: S32 = U8 * S8 + S32 +/// Matrix multiply-add operation: S32 = U4 * S4 + S32 template <> struct SparseMma< - gemm::GemmShape<16,8,64>, + gemm::GemmShape<16,8,128>, 32, - uint8_t, + cutlass::uint4b_t, layout::RowMajor, - int8_t, + cutlass::int4b_t, layout::ColumnMajor, int, layout::RowMajor, OpMultiplyAddSaturate, SPFormatType::Thread> { - using Shape = gemm::GemmShape<16,8,64>; + using Shape = gemm::GemmShape<16,8,128>; - using ElementA = uint8_t; + using ElementA = cutlass::uint4b_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; - using ElementB = int8_t; + using ElementB = cutlass::int4b_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC = int; using LayoutC = layout::RowMajor; @@ -865,7 +1070,7 @@ struct SparseMma< using FragmentE = uint32_t; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; static int const kSparse = 2; @@ -893,46 +1098,64 @@ struct SparseMma< int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } }; -/// Matrix multiply-add operation: S32 = U8 * U8 + S32 +/// Matrix multiply-add operation: S32 = U4 * U4 + S32 template <> struct SparseMma< - gemm::GemmShape<16,8,64>, + gemm::GemmShape<16,8,128>, 32, 
- uint8_t, + cutlass::uint4b_t, layout::RowMajor, - uint8_t, + cutlass::uint4b_t, layout::ColumnMajor, int, layout::RowMajor, OpMultiplyAddSaturate, SPFormatType::Thread> { - using Shape = gemm::GemmShape<16,8,64>; + using Shape = gemm::GemmShape<16,8,128>; - using ElementA = uint8_t; + using ElementA = cutlass::uint4b_t; using LayoutA = layout::RowMajor; - using FragmentA = Array; + using FragmentA = Array; - using ElementB = uint8_t; + using ElementB = cutlass::uint4b_t; using LayoutB = layout::ColumnMajor; - using FragmentB = Array; + using FragmentB = Array; using ElementC = int; using LayoutC = layout::RowMajor; @@ -940,7 +1163,7 @@ struct SparseMma< using FragmentE = uint32_t; - using Operator = OpMultiplyAdd; + using Operator = OpMultiplyAddSaturate; using ArchTag = arch::Sm80; static int const kSparse = 2; @@ -968,630 +1191,36 @@ struct SparseMma< int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); - +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } #else - - assert(0); + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } #endif - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// -// Sparse Matrix Multiply 168128 - S4 input, S32 accumulation -// -//////////////////////////////////////////////////////////////////////////////// - -/// Matrix multiply-add operation: S32 = S4 * S4 + S32 -template <> -struct SparseMma< - gemm::GemmShape<16,8,128>, - 32, - cutlass::int4b_t, - layout::RowMajor, - cutlass::int4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd, - SPFormatType::Thread> { - - using Shape = gemm::GemmShape<16,8,128>; - - using ElementA = cutlass::int4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::int4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using FragmentE = uint32_t; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - static int const kSparse = 2; - - static int const kMetaSizeInBits = 2; - - static int const kMaxID2 = 1; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c, - uint32_t const &E, - int const 
id2 - ) const { - -#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); - -#else - - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = S4 * U4 + S32 -template <> -struct SparseMma< - gemm::GemmShape<16,8,128>, - 32, - cutlass::int4b_t, - layout::RowMajor, - cutlass::uint4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd, - SPFormatType::Thread> { - - using Shape = gemm::GemmShape<16,8,128>; - - using ElementA = cutlass::int4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::uint4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using FragmentE = uint32_t; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - static int const kSparse = 2; - - static int const kMetaSizeInBits = 2; - - static int const kMaxID2 = 1; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c, - uint32_t const &E, - int const id2 - ) const { - -#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); - -#else - - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U4 * S4 + S32 -template <> -struct SparseMma< - gemm::GemmShape<16,8,128>, - 32, - cutlass::uint4b_t, - layout::RowMajor, - cutlass::int4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd, - SPFormatType::Thread> { - - using Shape = gemm::GemmShape<16,8,128>; - - using ElementA = cutlass::uint4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::int4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using FragmentE = uint32_t; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - static int const kSparse = 2; - - static int const kMetaSizeInBits = 2; - - static int const kMaxID2 = 1; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c, - uint32_t const &E, - int const id2 - ) const { - -#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = 
reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); - -#else - - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U4 * U4 + S32 -template <> -struct SparseMma< - gemm::GemmShape<16,8,128>, - 32, - cutlass::uint4b_t, - layout::RowMajor, - cutlass::uint4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAdd, - SPFormatType::Thread> { - - using Shape = gemm::GemmShape<16,8,128>; - - using ElementA = cutlass::uint4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::uint4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using FragmentE = uint32_t; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - static int const kSparse = 2; - - static int const kMetaSizeInBits = 2; - - static int const kMaxID2 = 1; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c, - uint32_t const &E, - int const id2 - ) const { - -#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); - -#else - - assert(0); -#endif - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// -// Sparse Matrix Multiply 168128 - S4 input, S32 accumulation - SATURATE -// -//////////////////////////////////////////////////////////////////////////////// - -/// Matrix multiply-add operation: S32 = S4 * S4 + S32 -template <> -struct SparseMma< - gemm::GemmShape<16,8,128>, - 32, - cutlass::int4b_t, - layout::RowMajor, - cutlass::int4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAddSaturate, - SPFormatType::Thread> { - - using Shape = gemm::GemmShape<16,8,128>; - - using ElementA = cutlass::int4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::int4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using FragmentE = uint32_t; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - static int const kSparse = 2; - - static int const kMetaSizeInBits = 2; - - static int const kMaxID2 = 1; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c, - uint32_t const &E, - int const id2 - ) const { - -#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - 
uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); - -#else - - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = S4 * U4 + S32 -template <> -struct SparseMma< - gemm::GemmShape<16,8,128>, - 32, - cutlass::int4b_t, - layout::RowMajor, - cutlass::uint4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAddSaturate, - SPFormatType::Thread> { - - using Shape = gemm::GemmShape<16,8,128>; - - using ElementA = cutlass::int4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::uint4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using FragmentE = uint32_t; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - static int const kSparse = 2; - - static int const kMetaSizeInBits = 2; - - static int const kMaxID2 = 1; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c, - uint32_t const &E, - int const id2 - ) const { - -#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); - -#else - - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U4 * S4 + S32 -template <> -struct SparseMma< - gemm::GemmShape<16,8,128>, - 32, - cutlass::uint4b_t, - layout::RowMajor, - cutlass::int4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAddSaturate, - SPFormatType::Thread> { - - using Shape = gemm::GemmShape<16,8,128>; - - using ElementA = cutlass::uint4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::int4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using FragmentE = uint32_t; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - static int const kSparse = 2; - - static int const kMetaSizeInBits = 2; - - static int const kMaxID2 = 1; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c, - uint32_t const &E, - int const id2 - ) const { - -#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - if (id2 == 0) - asm volatile( - 
"mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); - -#else - - assert(0); -#endif - } -}; - -/// Matrix multiply-add operation: S32 = U4 * U4 + S32 -template <> -struct SparseMma< - gemm::GemmShape<16,8,128>, - 32, - cutlass::uint4b_t, - layout::RowMajor, - cutlass::uint4b_t, - layout::ColumnMajor, - int, - layout::RowMajor, - OpMultiplyAddSaturate, - SPFormatType::Thread> { - - using Shape = gemm::GemmShape<16,8,128>; - - using ElementA = cutlass::uint4b_t; - using LayoutA = layout::RowMajor; - using FragmentA = Array; - - using ElementB = cutlass::uint4b_t; - using LayoutB = layout::ColumnMajor; - using FragmentB = Array; - - using ElementC = int; - using LayoutC = layout::RowMajor; - using FragmentC = Array; - - using FragmentE = uint32_t; - - using Operator = OpMultiplyAdd; - using ArchTag = arch::Sm80; - - static int const kSparse = 2; - - static int const kMetaSizeInBits = 2; - - static int const kMaxID2 = 1; - - /// Computes multiply-add - CUTLASS_HOST_DEVICE - void operator()( - FragmentC &d, - FragmentA const &a, - FragmentB const &b, - FragmentC const &c, - uint32_t const &E, - int const id2 - ) const { - -#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) - - uint32_t const *A = reinterpret_cast(&a); - uint32_t const *B = reinterpret_cast(&b); - - int const *C = reinterpret_cast(&c); - int *D = reinterpret_cast(&d); - - if (id2 == 0) - asm volatile( - "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " - "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" - : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), - "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); - else - assert(0); #else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); assert(0); #endif } diff --git a/include/cutlass/arch/mma_sparse_sm89.h b/include/cutlass/arch/mma_sparse_sm89.h new file mode 100644 index 0000000000..2fae35be42 --- /dev/null +++ b/include/cutlass/arch/mma_sparse_sm89.h @@ -0,0 +1,405 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Sparse matrix multiply accumulate for SM89 +*/ + +#pragma once + +#include + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4) + +# define CUTLASS_ARCH_SPARSE_MMA_SM89_SUPPORTED 1 +#endif + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_SUPPORTED) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 890) +# define CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = fe4m3 * fe4m3 + F32 +template +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + cutlass::float_e4m3_t, + layout::RowMajor, + cutlass::float_e4m3_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_, + SPFormatType::Thread> { + + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16,8,64>; + + using ElementA = cutlass::float_e4m3_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e4m3_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.f32.e4m3.e4m3.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } + else { + assert(0); + } 
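For reference, a direct invocation of the FP8 specialization above could look like the sketch below. This is an illustrative addition, not part of the upstream patch: the wrapper name `sparse_mma_example`, its parameters, and the choice of `OpMultiplyAdd` (assumed here to be one of the operators accepted by the `static_assert` above) are assumptions; real kernels obtain the fragments and the sparse metadata word from CUTLASS's sparse MMA iterators rather than passing them in directly.

// Illustrative sketch only (editorial addition); assumes OpMultiplyAdd satisfies the static_assert above.
using SparseMmaF8 = cutlass::arch::SparseMma<
    cutlass::gemm::GemmShape<16, 8, 64>, 32,
    cutlass::float_e4m3_t, cutlass::layout::RowMajor,
    cutlass::float_e4m3_t, cutlass::layout::ColumnMajor,
    float, cutlass::layout::RowMajor,
    cutlass::arch::OpMultiplyAdd, cutlass::arch::SPFormatType::Thread>;

// Hypothetical wrapper showing the operator() calling convention of the functor defined above.
__device__ void sparse_mma_example(SparseMmaF8::FragmentC &d,
                                   SparseMmaF8::FragmentA const &a,
                                   SparseMmaF8::FragmentB const &b,
                                   SparseMmaF8::FragmentC const &c,
                                   uint32_t metadata) {  // packed 2:4 sparsity selectors for this thread
  SparseMmaF8 mma;
  mma(d, a, b, c, metadata, /*id2=*/0);  // kMaxID2 == 1, so only id2 == 0 is valid here
}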
+#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = fe4m3 * fe5m2 + F32 +template +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + cutlass::float_e4m3_t, + layout::RowMajor, + cutlass::float_e5m2_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_, + SPFormatType::Thread> { + + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16,8,64>; + + using ElementA = cutlass::float_e4m3_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e5m2_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.f32.e4m3.e5m2.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } + else { + assert(0); + } +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = fe5m2 * fe4m3 + F32 +template +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + cutlass::float_e5m2_t, + layout::RowMajor, + cutlass::float_e4m3_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_, + SPFormatType::Thread> { + + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16,8,64>; + + using ElementA = cutlass::float_e5m2_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e4m3_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + 
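// Reader's note (editorial, not part of the upstream patch): the reinterpret_casts above expose
// the FP8 fragments as packed 32-bit values, which is what the "r" operands of the mma.sp
// instruction below consume. Assuming the fragment sizes implied by the four operands bound per
// matrix, the packing corresponds to:
//
//   static_assert(sizeof(FragmentA) == 4 * sizeof(uint32_t), "A fragment = four .b32 registers");
//   static_assert(sizeof(FragmentB) == 4 * sizeof(uint32_t), "B fragment = four .b32 registers");
//   static_assert(sizeof(FragmentC) == 4 * sizeof(float),    "C/D fragments = four .f32 registers");
//
// The F32 accumulator fragments C and D are bound directly as "f" registers.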
float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.f32.e5m2.e4m3.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } + else { + assert(0); + } +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = fe5m2 * fe5m2 + F32 +template +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + cutlass::float_e5m2_t, + layout::RowMajor, + cutlass::float_e5m2_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_, + SPFormatType::Thread> { + + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16,8,64>; + + using ElementA = cutlass::float_e5m2_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e5m2_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.f32.e5m2.e5m2.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } + else { + assert(0); + } +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/reg_reconfig.h b/include/cutlass/arch/reg_reconfig.h new file mode 100644 index 0000000000..d2b434453e --- /dev/null +++ b/include/cutlass/arch/reg_reconfig.h @@ -0,0 +1,67 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief PTX for CTA Reconfiguration
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#ifndef CUDA_CTA_RECONFIG_ACTIVATED
+  #if (__CUDACC_VER_MAJOR__ >= 12 && \
+    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+    #define CUDA_CTA_RECONFIG_ACTIVATED 1
+  #endif
+#endif
+
+namespace cutlass {
+namespace arch {
+
+template <uint32_t RegCount>
+CUTLASS_DEVICE
+void warpgroup_reg_alloc(){
+#if CUDA_CTA_RECONFIG_ACTIVATED
+  asm volatile( "setmaxnreg.inc.sync.aligned.u32 %0;\n" : : "n"(RegCount) );
+#endif
+}
+
+template <uint32_t RegCount>
+CUTLASS_DEVICE
+void warpgroup_reg_dealloc(){
+#if CUDA_CTA_RECONFIG_ACTIVATED
+  asm volatile( "setmaxnreg.dec.sync.aligned.u32 %0;\n" : : "n"(RegCount) );
+#endif
+}
+
+} // namespace arch
+} // namespace cutlass
diff --git a/include/cutlass/arch/simd.h b/include/cutlass/arch/simd.h
index abbabf94b3..f670fc293f 100644
--- a/include/cutlass/arch/simd.h
+++ b/include/cutlass/arch/simd.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -34,8 +34,8 @@
 
 #pragma once
 
-#include "../array.h"
-#include "../numeric_types.h"
+#include "cutlass/arch/array.h"
+#include "cutlass/arch/numeric_types.h"
 
 namespace cutlass {
 namespace arch {
diff --git a/include/cutlass/arch/simd_sm60.h b/include/cutlass/arch/simd_sm60.h
index 1de8d42b05..6e1ef20441 100644
--- a/include/cutlass/arch/simd_sm60.h
+++ b/include/cutlass/arch/simd_sm60.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -50,8 +50,6 @@ template <> Array operator*(Array const &a, Array const &b) { Array d; - // TODO - return d; } @@ -60,8 +58,6 @@ template <> Array operator+(AArray const &a, Array const &b) { Array d; - // TODO - return d; } @@ -70,8 +66,6 @@ template <> Array operator-(Array const &a, Array const &b) { Array d; - // TODO - return d; } @@ -83,8 +77,6 @@ template <> Array mac(Array const &a, Array const &b, Array const &c) { Array d; - // TODO - return d; } @@ -95,8 +87,6 @@ CUTLASS_HOST_DEVICE template <> half_t dot(Array const &a, Array const &b, half_t accum) { - // TODO - return accum; } @@ -105,8 +95,6 @@ CUTLASS_HOST_DEVICE template <> float dot(Array const &a, Array const &b, float accum) { - // TODO - return accum; } diff --git a/include/cutlass/arch/simd_sm61.h b/include/cutlass/arch/simd_sm61.h index da236c16ea..b783c943ec 100644 --- a/include/cutlass/arch/simd_sm61.h +++ b/include/cutlass/arch/simd_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/arch/synclog.hpp b/include/cutlass/arch/synclog.hpp new file mode 100644 index 0000000000..8cf65ad73e --- /dev/null +++ b/include/cutlass/arch/synclog.hpp @@ -0,0 +1,1324 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file
+    \brief Synchronization event logging for race condition debugging.
+*/
+
+#pragma once
+
+#include "cutlass/detail/helper_macros.hpp"
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cstdint>
+#else
+#include <cstdint>
+#endif
+
+#if !defined(__CUDACC_RTC__)
+#include <mutex>
+#include <vector>
+#endif
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(CUTLASS_ENABLE_SYNCLOG)
+
+constexpr uint32_t synclog_cap = 1 << 26;
+
+inline std::mutex synclog_mutex;
+inline std::vector<uint32_t*> synclog_buf_list;
+#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+CUTLASS_DEVICE uint32_t* synclog_buf;
+#endif
+
+CUTLASS_DEVICE
+uint32_t* synclog_alloc(uint32_t n) {
+  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+  uint32_t* buf = synclog_buf;
+  if (buf == nullptr) return nullptr;
+  uint32_t last = atomicAdd(&buf[0], n);
+  if (last + n < synclog_cap) return buf + last + 1;
+  if (last >= synclog_cap) atomicAdd(&buf[0], -n);
+  #endif
+  return nullptr;
+}
+
+CUTLASS_DEVICE
+void synclog_emit_prefix(uint32_t* to, uint32_t header, uint32_t line) {
+  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+  uint64_t time64;
+  asm volatile (
+    "mov.u64 %0, %%globaltimer;\n"
+    : "=l"(time64) :
+  );
+  to[0] = header;
+  to[1] = line;
+  to[2] = time64;
+  to[3] = time64 >> 32;
+  to[4] = threadIdx.x;
+  to[5] = threadIdx.y;
+  to[6] = threadIdx.z;
+  to[7] = blockIdx.x;
+  to[8] = blockIdx.y;
+  to[9] = blockIdx.z;
+  #endif
+}
+
+constexpr uint32_t synclog_header_none = 0;
+constexpr uint32_t synclog_length_prefix = 1 + 1 + 2 + 3 + 3;
+
+constexpr bool synclog_enable_syncthreads = true;
+constexpr uint32_t synclog_header_syncthreads = 1;
+constexpr uint32_t synclog_length_syncthreads = synclog_length_prefix + 0;
+
+constexpr bool synclog_enable_syncwarp = true;
+constexpr uint32_t synclog_header_syncwarp = 2;
+constexpr uint32_t synclog_length_syncwarp = synclog_length_prefix + 0;
+
+constexpr bool synclog_enable_named_barrier_arrive_and_wait = true;
+constexpr uint32_t synclog_header_named_barrier_arrive_and_wait = 3;
+constexpr uint32_t synclog_length_named_barrier_arrive_and_wait = synclog_length_prefix + 2;
+
+constexpr bool synclog_enable_named_barrier_arrive = true;
+constexpr uint32_t synclog_header_named_barrier_arrive = 4;
+constexpr uint32_t synclog_length_named_barrier_arrive = synclog_length_prefix + 2;
+
+constexpr bool synclog_enable_cluster_barrier_init = true;
+constexpr uint32_t synclog_header_cluster_barrier_init = 5;
+constexpr uint32_t synclog_length_cluster_barrier_init = synclog_length_prefix + 2;
+
+constexpr bool synclog_enable_cluster_barrier_wait = true;
+constexpr uint32_t synclog_header_cluster_barrier_wait = 6;
+constexpr uint32_t synclog_length_cluster_barrier_wait = synclog_length_prefix + 4;
+
+constexpr bool synclog_enable_cluster_barrier_test_wait = true;
+constexpr uint32_t synclog_header_cluster_barrier_test_wait = 7;
+constexpr uint32_t synclog_length_cluster_barrier_test_wait = synclog_length_prefix + 5;
+
+constexpr bool synclog_enable_cluster_barrier_try_wait = true;
+constexpr uint32_t synclog_header_cluster_barrier_try_wait = 8;
+constexpr uint32_t synclog_length_cluster_barrier_try_wait = synclog_length_prefix + 4;
+
+constexpr bool synclog_enable_cluster_barrier_arrive_cluster = true;
+constexpr uint32_t synclog_header_cluster_barrier_arrive_cluster = 9;
+constexpr uint32_t synclog_length_cluster_barrier_arrive_cluster = synclog_length_prefix + 5;
+
+constexpr bool
synclog_enable_cluster_barrier_arrive = true; +constexpr uint32_t synclog_header_cluster_barrier_arrive = 10; +constexpr uint32_t synclog_length_cluster_barrier_arrive = synclog_length_prefix + 3; + +constexpr bool synclog_enable_cluster_barrier_invalidate = true; +constexpr uint32_t synclog_header_cluster_barrier_invalidate = 11; +constexpr uint32_t synclog_length_cluster_barrier_invalidate = synclog_length_prefix + 3; + +constexpr bool synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx = true; +constexpr uint32_t synclog_header_cluster_transaction_barrier_arrive_and_expect_tx = 12; +constexpr uint32_t synclog_length_cluster_transaction_barrier_arrive_and_expect_tx = synclog_length_prefix + 4; + +constexpr bool synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster = true; +constexpr uint32_t synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster = 13; +constexpr uint32_t synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster = synclog_length_prefix + 6; + +constexpr bool synclog_enable_cluster_transaction_barrier_expect_transaction = true; +constexpr uint32_t synclog_header_cluster_transaction_barrier_expect_transaction = 14; +constexpr uint32_t synclog_length_cluster_transaction_barrier_expect_transaction = synclog_length_prefix + 4; + +constexpr bool synclog_enable_cluster_transaction_barrier_complete_transaction = true; +constexpr uint32_t synclog_header_cluster_transaction_barrier_complete_transaction = 15; +constexpr uint32_t synclog_length_cluster_transaction_barrier_complete_transaction = synclog_length_prefix + 6; + +constexpr bool synclog_enable_fence_barrier_init = true; +constexpr uint32_t synclog_header_fence_barrier_init = 16; +constexpr uint32_t synclog_length_fence_barrier_init = synclog_length_prefix + 0; + +constexpr bool synclog_enable_fence_view_async_shared = true; +constexpr uint32_t synclog_header_fence_view_async_shared = 17; +constexpr uint32_t synclog_length_fence_view_async_shared = synclog_length_prefix + 0; + +constexpr bool synclog_enable_cp_async_wait = true; +constexpr uint32_t synclog_header_cp_async_wait = 18; +constexpr uint32_t synclog_length_cp_async_wait = synclog_length_prefix + 1; + +constexpr bool synclog_enable_cp_async_wait_all = true; +constexpr uint32_t synclog_header_cp_async_wait_all = 19; +constexpr uint32_t synclog_length_cp_async_wait_all = synclog_length_prefix + 0; + +constexpr bool synclog_enable_cp_async_fence = true; +constexpr uint32_t synclog_header_cp_async_fence = 20; +constexpr uint32_t synclog_length_cp_async_fence = synclog_length_prefix + 0; + +constexpr bool synclog_enable_cp_async_nan = true; +constexpr uint32_t synclog_header_cp_async_nan = 21; +constexpr uint32_t synclog_length_cp_async_nan = synclog_length_prefix + 4; + +constexpr bool synclog_enable_cp_async_zfill = true; +constexpr uint32_t synclog_header_cp_async_zfill = 22; +constexpr uint32_t synclog_length_cp_async_zfill = synclog_length_prefix + 5; + +constexpr bool synclog_enable_cp_async = true; +constexpr uint32_t synclog_header_cp_async = 23; +constexpr uint32_t synclog_length_cp_async = synclog_length_prefix + 5; + +constexpr bool synclog_enable_tma_load = true; +constexpr uint32_t synclog_header_tma_load = 24; +constexpr uint32_t synclog_length_tma_load = synclog_length_prefix + 4; + +constexpr bool synclog_enable_tma_store = true; +constexpr uint32_t synclog_header_tma_store = 25; +constexpr uint32_t synclog_length_tma_store = synclog_length_prefix + 3; + +constexpr bool 
synclog_enable_tma_store_arrive = true; +constexpr uint32_t synclog_header_tma_store_arrive = 26; +constexpr uint32_t synclog_length_tma_store_arrive = synclog_length_prefix + 0; + +constexpr bool synclog_enable_tma_store_wait = true; +constexpr uint32_t synclog_header_tma_store_wait = 27; +constexpr uint32_t synclog_length_tma_store_wait = synclog_length_prefix + 1; + +constexpr bool synclog_enable_warpgroup_arrive = true; +constexpr uint32_t synclog_header_warpgroup_arrive = 28; +constexpr uint32_t synclog_length_warpgroup_arrive = synclog_length_prefix + 0; + +constexpr bool synclog_enable_warpgroup_wait = true; +constexpr uint32_t synclog_header_warpgroup_wait = 29; +constexpr uint32_t synclog_length_warpgroup_wait = synclog_length_prefix + 1; + +constexpr bool synclog_enable_warpgroup_commit_batch = true; +constexpr uint32_t synclog_header_warpgroup_commit_batch = 30; +constexpr uint32_t synclog_length_warpgroup_commit_batch = synclog_length_prefix + 0; + +constexpr bool synclog_enable_wgmma_reg_smem = true; +constexpr uint32_t synclog_header_wgmma_reg_smem = 31; +constexpr uint32_t synclog_length_wgmma_reg_smem = synclog_length_prefix + 2; + +constexpr bool synclog_enable_wgmma_smem_smem = true; +constexpr uint32_t synclog_header_wgmma_smem_smem = 32; +constexpr uint32_t synclog_length_wgmma_smem_smem = synclog_length_prefix + 4; + +constexpr bool synclog_enable_cpasync_barrier_arrive = true; +constexpr uint32_t synclog_header_cpasync_barrier_arrive = 33; +constexpr uint32_t synclog_length_cpasync_barrier_arrive = synclog_length_prefix + 3; + +CUTLASS_DEVICE +bool synclog_condition_emit() { + #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) + return threadIdx.x%NumThreadsPerWarp == 0 && threadIdx.y == 0 && threadIdx.z == 0 && + blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0; + #else + return 0; + #endif +} + +CUTLASS_DEVICE +bool synclog_condition_print() { + #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) + return threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 && + blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0; + #else + return false; + #endif +} + +CUTLASS_DEVICE +void synclog_print_prefix(char const* header, uint32_t at) { + #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) + uint32_t line = synclog_buf[at + 1]; + uint32_t timeLo = synclog_buf[at + 2]; + uint32_t timeHi = synclog_buf[at + 3]; + uint32_t threadIdxX = synclog_buf[at + 4]; + uint32_t threadIdxY = synclog_buf[at + 5]; + uint32_t threadIdxZ = synclog_buf[at + 6]; + uint32_t blockIdxX = synclog_buf[at + 7]; + uint32_t blockIdxY = synclog_buf[at + 8]; + uint32_t blockIdxZ = synclog_buf[at + 9]; + printf( + "%s line=%u time=%lu thread=%u,%u,%u block=%u,%u,%u ", + header, line, + (uint64_t)timeHi << 32 | timeLo, + threadIdxX, threadIdxY, threadIdxZ, + blockIdxX, blockIdxY, blockIdxZ + ); + #endif +} + +CUTLASS_DEVICE +uint64_t synclog_mbarrier_bits(uint32_t smem_addr) { + uint64_t bits = 0; + asm volatile ( + "mbarrier.inval.shared::cta.b64 [%1];\n" + "ld.shared::cta.b64 %0, [%1];\n" + : "=l"(bits) : "r"(smem_addr) + ); + return bits; +} + +CUTLASS_DEVICE +void synclog_print_wgmma_desc(char const* str, uint32_t lo, uint32_t hi, char const* sep) { + CUTLASS_UNUSED(hi); + uint32_t smem_int_ptr = (lo & ((1 << 14) - 1)) << 4; + printf("%s_smem_int_ptr=%u%s", str, smem_int_ptr, sep); +} + +#endif // defined(CUTLASS_ENABLE_SYNCLOG) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline void 
synclog_setup() { + #if defined(CUTLASS_ENABLE_SYNCLOG) + #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) + std::scoped_lock lock(synclog_mutex); + auto fail = [] () { + fprintf(stderr, "synclog_setup() failed\n"); + std::terminate(); + }; + int orig_device = 0; + if (cudaGetDevice(&orig_device) != cudaSuccess) { + fail(); + } + int device_count = 0; + if (cudaGetDeviceCount(&device_count) != cudaSuccess) { + fail(); + } + if (synclog_buf_list.size() == 0) { + for (int device = 0; device < device_count; device++) { + uint32_t* buf = 0; + if (cudaSetDevice(device) != cudaSuccess || + cudaMalloc(&buf, synclog_cap * sizeof(uint32_t)) != cudaSuccess) { + fail(); + } + synclog_buf_list.push_back(buf); + } + } + for (int device = 0; device < device_count; device++) { + uint32_t* buf = synclog_buf_list.at(device); + if (cudaSetDevice(device) != cudaSuccess || + cudaMemset(buf, 0, synclog_cap * sizeof(uint32_t)) != cudaSuccess || + cudaMemcpyToSymbol(synclog_buf, &buf, sizeof(buf)) != cudaSuccess) { + fail(); + } + } + if (cudaSetDevice(orig_device) != cudaSuccess) { + fail(); + } + #endif + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_syncthreads(uint32_t line) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_syncthreads) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_syncthreads); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_syncthreads, line); + #else + CUTLASS_UNUSED(line); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_syncwarp(uint32_t line) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_syncwarp) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_syncwarp); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_syncwarp, line); + #else + CUTLASS_UNUSED(line); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_named_barrier_arrive_and_wait( + uint32_t line, + uint32_t num_threads, + uint32_t barrier_id) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_named_barrier_arrive_and_wait) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_named_barrier_arrive_and_wait); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_named_barrier_arrive_and_wait, line); + to[synclog_length_prefix + 0] = num_threads; + to[synclog_length_prefix + 1] = barrier_id; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(num_threads); + CUTLASS_UNUSED(barrier_id); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_named_barrier_arrive( + uint32_t line, + uint32_t num_threads, + uint32_t barrier_id) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_named_barrier_arrive) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_named_barrier_arrive); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_named_barrier_arrive, line); + to[synclog_length_prefix + 0] = num_threads; + to[synclog_length_prefix + 1] = barrier_id; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(num_threads); + CUTLASS_UNUSED(barrier_id); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cluster_barrier_init( + uint32_t line, + uint32_t smem_addr, + uint32_t arrive_count) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr 
(!synclog_enable_cluster_barrier_init) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_init); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cluster_barrier_init, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = arrive_count; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(arrive_count); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cluster_barrier_wait( + uint32_t line, + uint32_t smem_addr, + uint32_t phase) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cluster_barrier_wait) return; + if (!synclog_condition_emit()) return; + uint64_t bits = synclog_mbarrier_bits(smem_addr); + uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_wait); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cluster_barrier_wait, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = phase; + to[synclog_length_prefix + 2] = bits; + to[synclog_length_prefix + 3] = bits >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(phase); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cluster_barrier_test_wait( + uint32_t line, + uint32_t smem_addr, + uint32_t phase, + uint32_t pred) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cluster_barrier_test_wait) return; + if (!synclog_condition_emit()) return; + uint64_t bits = synclog_mbarrier_bits(smem_addr); + uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_test_wait); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cluster_barrier_test_wait, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = phase; + to[synclog_length_prefix + 2] = pred; + to[synclog_length_prefix + 3] = bits; + to[synclog_length_prefix + 4] = bits >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(phase); + CUTLASS_UNUSED(pred); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cluster_barrier_try_wait( + uint32_t line, + uint32_t smem_addr, + uint32_t phase) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cluster_barrier_try_wait) return; + if (!synclog_condition_emit()) return; + uint64_t bits = synclog_mbarrier_bits(smem_addr); + uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_try_wait); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cluster_barrier_try_wait, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = phase; + to[synclog_length_prefix + 2] = bits; + to[synclog_length_prefix + 3] = bits >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(phase); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cluster_barrier_arrive_cluster( + uint32_t line, + uint32_t smem_addr, + uint32_t cta_id, + uint32_t pred) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cluster_barrier_arrive_cluster) return; + if (!synclog_condition_emit()) return; + uint64_t bits = synclog_mbarrier_bits(smem_addr); + uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_arrive_cluster); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cluster_barrier_arrive_cluster, line); + to[synclog_length_prefix + 0] = smem_addr; + 
to[synclog_length_prefix + 1] = cta_id; + to[synclog_length_prefix + 2] = pred; + to[synclog_length_prefix + 3] = bits; + to[synclog_length_prefix + 4] = bits >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(cta_id); + CUTLASS_UNUSED(pred); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cluster_barrier_arrive( + uint32_t line, + uint32_t smem_addr) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cluster_barrier_arrive) return; + if (!synclog_condition_emit()) return; + uint64_t bits = synclog_mbarrier_bits(smem_addr); + uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_arrive); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cluster_barrier_arrive, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = bits; + to[synclog_length_prefix + 2] = bits >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cluster_barrier_invalidate( + uint32_t line, + uint32_t smem_addr) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cluster_barrier_invalidate) return; + if (!synclog_condition_emit()) return; + uint64_t bits = synclog_mbarrier_bits(smem_addr); + uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_invalidate); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cluster_barrier_invalidate, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = bits; + to[synclog_length_prefix + 2] = bits >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx( + uint32_t line, + uint32_t smem_addr, + uint32_t transaction_bytes) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx) return; + if (!synclog_condition_emit()) return; + uint64_t bits = synclog_mbarrier_bits(smem_addr); + uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_arrive_and_expect_tx); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_arrive_and_expect_tx, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = transaction_bytes; + to[synclog_length_prefix + 2] = bits; + to[synclog_length_prefix + 3] = bits >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(transaction_bytes); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx_cluster( + uint32_t line, + uint32_t smem_addr, + uint32_t transaction_bytes, + uint32_t cta_id, + uint32_t pred) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster) return; + if (!synclog_condition_emit()) return; + uint64_t bits = synclog_mbarrier_bits(smem_addr); + uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = transaction_bytes; + to[synclog_length_prefix + 2] = cta_id; + to[synclog_length_prefix + 3] = pred; + 
to[synclog_length_prefix + 4] = bits; + to[synclog_length_prefix + 5] = bits >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(transaction_bytes); + CUTLASS_UNUSED(cta_id); + CUTLASS_UNUSED(pred); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cluster_transaction_barrier_expect_transaction( + uint32_t line, + uint32_t smem_addr, + uint32_t transaction_bytes) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cluster_transaction_barrier_expect_transaction) return; + if (!synclog_condition_emit()) return; + uint64_t bits = synclog_mbarrier_bits(smem_addr); + uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_expect_transaction); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_expect_transaction, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = transaction_bytes; + to[synclog_length_prefix + 2] = bits; + to[synclog_length_prefix + 3] = bits >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(transaction_bytes); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cluster_transaction_barrier_complete_transaction( + uint32_t line, + uint32_t smem_addr, + uint32_t dst_cta_id, + uint32_t transaction_bytes, + uint32_t pred) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cluster_transaction_barrier_complete_transaction) return; + if (!synclog_condition_emit()) return; + uint64_t bits = synclog_mbarrier_bits(smem_addr); + uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_complete_transaction); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_complete_transaction, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = dst_cta_id; + to[synclog_length_prefix + 2] = transaction_bytes; + to[synclog_length_prefix + 3] = pred; + to[synclog_length_prefix + 4] = bits; + to[synclog_length_prefix + 5] = bits >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(dst_cta_id); + CUTLASS_UNUSED(transaction_bytes); + CUTLASS_UNUSED(pred); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_fence_barrier_init(uint32_t line) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_fence_barrier_init) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_fence_barrier_init); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_fence_barrier_init, line); + #else + CUTLASS_UNUSED(line); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_fence_view_async_shared(uint32_t line) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_fence_view_async_shared) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_fence_view_async_shared); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_fence_view_async_shared, line); + #else + CUTLASS_UNUSED(line); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cp_async_wait( + uint32_t line, + uint32_t n) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cp_async_wait) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_cp_async_wait); + if (to == nullptr) return; 
+ synclog_emit_prefix(to, synclog_header_cp_async_wait, line); + to[synclog_length_prefix + 0] = n; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(n); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cp_async_wait_all(uint32_t line) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cp_async_wait_all) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_cp_async_wait_all); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cp_async_wait_all, line); + #else + CUTLASS_UNUSED(line); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cp_async_fence(uint32_t line) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cp_async_fence) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_cp_async_fence); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cp_async_fence, line); + #else + CUTLASS_UNUSED(line); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cp_async_nan( + uint32_t line, + uint32_t smem_addr, + const void* gmem_ptr, + uint32_t pred) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cp_async_nan) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_cp_async_nan); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cp_async_nan, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr); + to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32); + to[synclog_length_prefix + 3] = pred; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(gmem_ptr); + CUTLASS_UNUSED(pred); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cp_async_zfill( + uint32_t line, + uint32_t smem_addr, + const void* gmem_ptr, + uint32_t pred, + uint32_t size) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cp_async_zfill) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_cp_async_zfill); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cp_async_zfill, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr); + to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32); + to[synclog_length_prefix + 3] = pred; + to[synclog_length_prefix + 4] = size; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(gmem_ptr); + CUTLASS_UNUSED(pred); + CUTLASS_UNUSED(size); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cp_async( + uint32_t line, + uint32_t smem_addr, + const void* gmem_ptr, + uint32_t pred, + uint32_t size) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cp_async) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_cp_async); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cp_async, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr); + to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32); + to[synclog_length_prefix + 3] = pred; + to[synclog_length_prefix + 4] = size; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + CUTLASS_UNUSED(gmem_ptr); + 
CUTLASS_UNUSED(pred); + CUTLASS_UNUSED(size); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_tma_load( + uint32_t line, + uint64_t gmem_int_desc, + uint32_t smem_int_mbar, + uint32_t smem_int_ptr) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_tma_load) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_tma_load); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_tma_load, line); + to[synclog_length_prefix + 0] = (uint32_t)((uint64_t)gmem_int_desc); + to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_int_desc >> 32); + to[synclog_length_prefix + 2] = smem_int_mbar; + to[synclog_length_prefix + 3] = smem_int_ptr; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(gmem_int_desc); + CUTLASS_UNUSED(smem_int_mbar); + CUTLASS_UNUSED(smem_int_ptr); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_tma_store( + uint32_t line, + uint64_t gmem_int_desc, + uint32_t smem_int_ptr) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_tma_store) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_tma_store); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_tma_store, line); + to[synclog_length_prefix + 0] = (uint32_t)((uint64_t)gmem_int_desc); + to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_int_desc >> 32); + to[synclog_length_prefix + 2] = smem_int_ptr; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(gmem_int_desc); + CUTLASS_UNUSED(smem_int_ptr); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_tma_store_arrive(uint32_t line) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_tma_store_arrive) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_tma_store_arrive); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_tma_store_arrive, line); + #else + CUTLASS_UNUSED(line); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_tma_store_wait( + uint32_t line, + uint32_t count) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_tma_store_wait) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_tma_store_wait); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_tma_store_wait, line); + to[synclog_length_prefix + 0] = count; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(count); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_warpgroup_arrive( + uint32_t line) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_warpgroup_arrive) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_warpgroup_arrive); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_warpgroup_arrive, line); + #else + CUTLASS_UNUSED(line); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_warpgroup_wait( + uint32_t line, + uint32_t n) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_warpgroup_wait) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_warpgroup_wait); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_warpgroup_wait, line); + to[synclog_length_prefix + 0] = n; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(n); 
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_warpgroup_commit_batch( + uint32_t line) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_warpgroup_commit_batch) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_warpgroup_commit_batch); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_warpgroup_commit_batch, line); + #else + CUTLASS_UNUSED(line); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_wgmma_reg_smem( + uint32_t line, + uint64_t desc_b) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_wgmma_reg_smem) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_wgmma_reg_smem); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_wgmma_reg_smem, line); + to[synclog_length_prefix + 0] = desc_b; + to[synclog_length_prefix + 1] = desc_b >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(desc_b); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_wgmma_smem_smem( + uint32_t line, + uint64_t desc_a, + uint64_t desc_b) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_wgmma_smem_smem) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_wgmma_smem_smem); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_wgmma_smem_smem, line); + to[synclog_length_prefix + 0] = desc_a; + to[synclog_length_prefix + 1] = desc_a >> 32; + to[synclog_length_prefix + 2] = desc_b; + to[synclog_length_prefix + 3] = desc_b >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(desc_a); + CUTLASS_UNUSED(desc_b); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_cpasync_barrier_arrive( + uint32_t line, + uint32_t smem_addr) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_cpasync_barrier_arrive) return; + if (!synclog_condition_emit()) return; + uint64_t bits = synclog_mbarrier_bits(smem_addr); + uint32_t* to = synclog_alloc(synclog_length_cpasync_barrier_arrive); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_cpasync_barrier_arrive, line); + to[synclog_length_prefix + 0] = smem_addr; + to[synclog_length_prefix + 1] = bits; + to[synclog_length_prefix + 2] = bits >> 32; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(smem_addr); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +#if !defined(CUTLASS_ENABLE_SYNCLOG) +CUTLASS_DEVICE +#elif defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) +static __attribute__((__noinline__)) __device__ +#else +static __attribute__((__noinline__)) +#endif +void synclog_print() { + #if defined(CUTLASS_ENABLE_SYNCLOG) + #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) + if (synclog_buf == nullptr || !synclog_condition_print()) { + return; + } + printf("synclog start\n"); + for (uint32_t at = 1; at < synclog_cap; ) { + uint32_t header = synclog_buf[at]; + if (header == synclog_header_none) { + break; + } + printf("synclog at %u: ", at); + if constexpr (synclog_enable_syncthreads) { + if (header == synclog_header_syncthreads) { + synclog_print_prefix("syncthreads", at); + at += synclog_length_syncthreads; + printf("\n"); + continue; + } + } + if constexpr (synclog_enable_syncwarp) { + if (header == synclog_header_syncwarp) { + synclog_print_prefix("syncwarp", at); + at += synclog_length_syncwarp; + printf("\n"); + continue; + } + 
} + if constexpr (synclog_enable_named_barrier_arrive_and_wait) { + if (header == synclog_header_named_barrier_arrive_and_wait) { + synclog_print_prefix("named_barrier_arrive_and_wait", at); + at += synclog_length_named_barrier_arrive_and_wait; + printf("num_threads=%u barrier_id=%u\n", synclog_buf[at-2], synclog_buf[at-1]); + continue; + } + } + if constexpr (synclog_enable_named_barrier_arrive) { + if (header == synclog_header_named_barrier_arrive) { + synclog_print_prefix("named_barrier_arrive", at); + at += synclog_length_named_barrier_arrive; + printf("num_threads=%u barrier_id=%u\n", synclog_buf[at-2], synclog_buf[at-1]); + continue; + } + } + if constexpr (synclog_enable_cluster_barrier_init) { + if (header == synclog_header_cluster_barrier_init) { + synclog_print_prefix("cluster_barrier_init", at); + at += synclog_length_cluster_barrier_init; + printf("smem_addr=%u arrive_count=%u\n", synclog_buf[at-2], synclog_buf[at-1]); + continue; + } + } + if constexpr (synclog_enable_cluster_barrier_wait) { + if (header == synclog_header_cluster_barrier_wait) { + synclog_print_prefix("cluster_barrier_wait", at); + at += synclog_length_cluster_barrier_wait; + printf("smem_addr=%u phase=%u", synclog_buf[at-4], synclog_buf[at-3]); + continue; + } + } + if constexpr (synclog_enable_cluster_barrier_test_wait) { + if (header == synclog_header_cluster_barrier_test_wait) { + synclog_print_prefix("cluster_barrier_test_wait", at); + at += synclog_length_cluster_barrier_test_wait; + printf("smem_addr=%u phase=%u pred=%u", synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]); + continue; + } + } + if constexpr (synclog_enable_cluster_barrier_try_wait) { + if (header == synclog_header_cluster_barrier_try_wait) { + synclog_print_prefix("cluster_barrier_try_wait", at); + at += synclog_length_cluster_barrier_try_wait; + printf("smem_addr=%u phase=%u", synclog_buf[at-4], synclog_buf[at-3]); + continue; + } + } + if constexpr (synclog_enable_cluster_barrier_arrive_cluster) { + if (header == synclog_header_cluster_barrier_arrive_cluster) { + synclog_print_prefix("cluster_barrier_arrive_cluster", at); + at += synclog_length_cluster_barrier_arrive_cluster; + printf("smem_addr=%u cta_id=%u pred=%u", synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]); + continue; + } + } + if constexpr (synclog_enable_cluster_barrier_arrive) { + if (header == synclog_header_cluster_barrier_arrive) { + synclog_print_prefix("cluster_barrier_arrive", at); + at += synclog_length_cluster_barrier_arrive; + printf("smem_addr=%u", synclog_buf[at-3]); + continue; + } + } + if constexpr (synclog_enable_cluster_barrier_invalidate) { + if (header == synclog_header_cluster_barrier_invalidate) { + synclog_print_prefix("cluster_barrier_invalidate", at); + at += synclog_length_cluster_barrier_invalidate; + printf("smem_addr=%u", synclog_buf[at-3]); + continue; + } + } + if constexpr (synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx) { + if (header == synclog_header_cluster_transaction_barrier_arrive_and_expect_tx) { + synclog_print_prefix("cluster_transaction_barrier_arrive_and_expect_tx", at); + at += synclog_length_cluster_transaction_barrier_arrive_and_expect_tx; + printf("smem_addr=%u transaction_bytes=%u", synclog_buf[at-4], synclog_buf[at-3]); + continue; + } + } + if constexpr (synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster) { + if (header == synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster) { + 
synclog_print_prefix("cluster_transaction_barrier_arrive_and_expect_tx_cluster", at); + at += synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster; + printf("smem_addr=%u transaction_bytes=%u cta_id=%u pred=%u", synclog_buf[at-6], synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]); + continue; + } + } + if constexpr (synclog_enable_cluster_transaction_barrier_expect_transaction) { + if (header == synclog_header_cluster_transaction_barrier_expect_transaction) { + synclog_print_prefix("cluster_transaction_barrier_expect_transaction", at); + at += synclog_length_cluster_transaction_barrier_expect_transaction; + printf("smem_addr=%u transaction_bytes=%u", synclog_buf[at-4], synclog_buf[at-3]); + continue; + } + } + if constexpr (synclog_enable_cluster_transaction_barrier_complete_transaction) { + if (header == synclog_header_cluster_transaction_barrier_complete_transaction) { + synclog_print_prefix("cluster_transaction_barrier_complete_transaction", at); + at += synclog_length_cluster_transaction_barrier_complete_transaction; + printf("smem_addr=%u dst_cta_id=%u transaction_bytes=%u pred=%u", synclog_buf[at-6], synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]); + continue; + } + } + if constexpr (synclog_enable_fence_barrier_init) { + if (header == synclog_header_fence_barrier_init) { + synclog_print_prefix("fence_barrier_init", at); + at += synclog_length_fence_barrier_init; + printf("\n"); + continue; + } + } + if constexpr (synclog_enable_fence_view_async_shared) { + if (header == synclog_header_fence_view_async_shared) { + synclog_print_prefix("fence_view_async_shared", at); + at += synclog_length_fence_view_async_shared; + printf("\n"); + continue; + } + } + if constexpr (synclog_enable_cp_async_wait) { + if (header == synclog_header_cp_async_wait) { + synclog_print_prefix("cp_async_wait", at); + at += synclog_length_cp_async_wait; + printf("n=%u\n", synclog_buf[at-1]); + continue; + } + } + if constexpr (synclog_enable_cp_async_wait_all) { + if (header == synclog_header_cp_async_wait_all) { + synclog_print_prefix("cp_async_wait_all", at); + at += synclog_length_cp_async_wait_all; + printf("\n"); + continue; + } + } + if constexpr (synclog_enable_cp_async_fence) { + if (header == synclog_header_cp_async_fence) { + synclog_print_prefix("cp_async_fence", at); + at += synclog_length_cp_async_fence; + printf("\n"); + continue; + } + } + if constexpr (synclog_enable_cp_async_nan) { + if (header == synclog_header_cp_async_nan) { + synclog_print_prefix("cp_async_nan", at); + at += synclog_length_cp_async_nan; + uint64_t gmem_addr = synclog_buf[at-3]; + gmem_addr += (uint64_t)synclog_buf[at-2] << 32; + printf("smem_addr=%u gmem_addr=%llu pred=%u\n", synclog_buf[at-4], gmem_addr, synclog_buf[at-1]); + continue; + } + } + if constexpr (synclog_enable_cp_async_zfill) { + if (header == synclog_header_cp_async_zfill) { + synclog_print_prefix("cp_async_zfill", at); + at += synclog_length_cp_async_zfill; + uint64_t gmem_addr = synclog_buf[at-4]; + gmem_addr += (uint64_t)synclog_buf[at-3] << 32; + printf("smem_addr=%u gmem_addr=%llu pred=%u size=%u\n", synclog_buf[at-5], gmem_addr, synclog_buf[at-2], synclog_buf[at-1]); + continue; + } + } + if constexpr (synclog_enable_cp_async) { + if (header == synclog_header_cp_async) { + synclog_print_prefix("cp_async", at); + at += synclog_length_cp_async; + uint64_t gmem_addr = synclog_buf[at-4]; + gmem_addr += (uint64_t)synclog_buf[at-3] << 32; + printf("smem_addr=%u gmem_addr=%llu pred=%u size=%u\n", synclog_buf[at-5], gmem_addr, 
synclog_buf[at-2], synclog_buf[at-1]); + continue; + } + } + if constexpr (synclog_enable_tma_load) { + if (header == synclog_header_tma_load) { + synclog_print_prefix("tma_load", at); + at += synclog_length_tma_load; + uint64_t gmem_int_desc = synclog_buf[at-4]; + gmem_int_desc += (uint64_t)synclog_buf[at-3] << 32; + printf("gmem_int_desc=%llu smem_int_mbar=%u smem_int_ptr=%u\n", gmem_int_desc, synclog_buf[at-2], synclog_buf[at-1]); + continue; + } + } + if constexpr (synclog_enable_tma_store) { + if (header == synclog_header_tma_store) { + synclog_print_prefix("tma_store", at); + at += synclog_length_tma_store; + uint64_t gmem_int_desc = synclog_buf[at-3]; + gmem_int_desc += (uint64_t)synclog_buf[at-2] << 32; + printf("gmem_int_desc=%llu smem_int_ptr=%u\n", gmem_int_desc, synclog_buf[at-1]); + continue; + } + } + if constexpr (synclog_enable_tma_store_arrive) { + if (header == synclog_header_tma_store_arrive) { + synclog_print_prefix("tma_store_arrive", at); + at += synclog_length_tma_store_arrive; + printf("\n"); + continue; + } + } + if constexpr (synclog_enable_tma_store_wait) { + if (header == synclog_header_tma_store_wait) { + synclog_print_prefix("tma_store_wait", at); + at += synclog_length_tma_store_wait; + printf("count=%u\n", synclog_buf[at-1]); + continue; + } + } + if constexpr (synclog_enable_warpgroup_arrive) { + if (header == synclog_header_warpgroup_arrive) { + synclog_print_prefix("warpgroup_arrive", at); + at += synclog_length_warpgroup_arrive; + printf("\n"); + continue; + } + } + if constexpr (synclog_enable_warpgroup_wait) { + if (header == synclog_header_warpgroup_wait) { + synclog_print_prefix("warpgroup_wait", at); + at += synclog_length_warpgroup_wait; + printf("n=%u\n", synclog_buf[at-1]); + continue; + } + } + if constexpr (synclog_enable_warpgroup_commit_batch) { + if (header == synclog_header_warpgroup_commit_batch) { + synclog_print_prefix("warpgroup_commit_batch", at); + at += synclog_length_warpgroup_commit_batch; + printf("\n"); + continue; + } + } + if constexpr (synclog_enable_wgmma_reg_smem) { + if (header == synclog_header_wgmma_reg_smem) { + synclog_print_prefix("wgmma_reg_smem", at); + at += synclog_length_wgmma_reg_smem; + synclog_print_wgmma_desc("desc_b", synclog_buf[at-2], synclog_buf[at-1], ""); + printf("\n"); + continue; + } + } + if constexpr (synclog_enable_wgmma_smem_smem) { + if (header == synclog_header_wgmma_smem_smem) { + synclog_print_prefix("wgmma_smem_smem", at); + at += synclog_length_wgmma_smem_smem; + synclog_print_wgmma_desc("desc_a", synclog_buf[at-4], synclog_buf[at-3], " "); + synclog_print_wgmma_desc("desc_b", synclog_buf[at-2], synclog_buf[at-1], ""); + printf("\n"); + continue; + } + } + if constexpr (synclog_enable_cpasync_barrier_arrive) { + if (header == synclog_header_cpasync_barrier_arrive) { + synclog_print_prefix("cpasync_barrier_arrive", at); + at += synclog_length_cpasync_barrier_arrive; + printf("smem_addr=%u", synclog_buf[at-3]); + continue; + } + } + asm volatile ("brkpt;\n" ::); + } + if (synclog_buf[0] >= synclog_cap) { + printf( + "synclog was truncated (exceeded capacity of %lu bytes)\n", + (synclog_cap - 1) * sizeof(uint32_t) + ); + } + printf("synclog end\n"); + #endif + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ENABLE_SYNCLOG) +#undef __syncthreads +#define __syncthreads() do {\ + cutlass::arch::synclog_emit_syncthreads(__LINE__);\ + __syncthreads();\ +} while (0) +#endif // 
defined(CUTLASS_ENABLE_SYNCLOG) + +#if defined(CUTLASS_ENABLE_SYNCLOG) +#undef __syncwarp +#define __syncwarp(...) do {\ + cutlass::arch::synclog_emit_syncwarp(__LINE__);\ + __syncwarp(__VA_ARGS__);\ +} while (0) +#endif // defined(CUTLASS_ENABLE_SYNCLOG) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass diff --git a/include/cutlass/arch/wmma.h b/include/cutlass/arch/wmma.h index 13d0a9ed97..720895f385 100644 --- a/include/cutlass/arch/wmma.h +++ b/include/cutlass/arch/wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/arch/wmma_sm70.h b/include/cutlass/arch/wmma_sm70.h index 59d8eeed8e..d75ee2b075 100644 --- a/include/cutlass/arch/wmma_sm70.h +++ b/include/cutlass/arch/wmma_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -34,11 +34,7 @@ #pragma once -#if defined(__CUDACC_RTC__) #include -#else -#include -#endif #include "cutlass/layout/matrix.h" //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/wmma_sm72.h b/include/cutlass/arch/wmma_sm72.h index 0895c86b3a..b644181b80 100644 --- a/include/cutlass/arch/wmma_sm72.h +++ b/include/cutlass/arch/wmma_sm72.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -34,11 +34,7 @@ #pragma once -#if defined(__CUDACC_RTC__) #include -#else -#include -#endif #include "cutlass/layout/matrix.h" //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/wmma_sm75.h b/include/cutlass/arch/wmma_sm75.h index c2c9e068f5..f603605128 100644 --- a/include/cutlass/arch/wmma_sm75.h +++ b/include/cutlass/arch/wmma_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -34,11 +34,7 @@ #pragma once -#if defined(__CUDACC_RTC__) #include -#else -#include -#endif #include "cutlass/layout/matrix.h" //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/array.h b/include/cutlass/array.h index 347002f008..e85d19facf 100644 --- a/include/cutlass/array.h +++ b/include/cutlass/array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -35,8 +35,9 @@ #pragma once #include "cutlass/cutlass.h" +#include "cutlass/functional.h" #include "cutlass/numeric_types.h" - +#include "cutlass/platform/platform.h" namespace cutlass { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -47,15 +48,31 @@ template < int N, bool RegisterSized = sizeof_bits::value >= 32 > -class Array; +struct Array; + +namespace detail { + +template +struct is_Array : platform::false_type {}; + +template < + typename T, + int N, + bool RegisterSized +> +struct is_Array > : platform::true_type {}; + +template +constexpr bool is_Array_v = is_Array::value; + +} // namespace detail //////////////////////////////////////////////////////////////////////////////////////////////////// /// Defines the size of an Array<> in bits template struct sizeof_bits > { - static int const value = - int(sizeof(typename Array::Storage)) * 8 * int(Array::kStorageElements); + static constexpr int value = sizeof(Array) * 8; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -81,8 +98,7 @@ template < typename T, int N > -class Array { -public: +struct Array { /// Storage type using Storage = T; @@ -92,10 +108,10 @@ class Array { /// Number of storage elements //static std::size_t const kStorageElements = N; - static size_t const kStorageElements = N; + static constexpr size_t kStorageElements = N; /// Number of logical elements - static size_t const kElements = N; + static constexpr size_t kElements = N; // // C++ standard members @@ -337,26 +353,9 @@ class Array { } }; -private: - /// Internal storage Storage storage[kElements]; -public: - - #if 0 - CUTLASS_HOST_DEVICE - Array() { } - - CUTLASS_HOST_DEVICE - Array(Array const &x) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kElements; ++i) { - storage[i] = x.storage[i]; - } - } - #endif - /// Efficient clear method CUTLASS_HOST_DEVICE void clear() { @@ -442,7 +441,7 @@ class Array { CUTLASS_HOST_DEVICE void fill(T const &value) { CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kElements; ++i) { + for (int i = 0; i < int(kElements); ++i) { storage[i] = static_cast(value); } } @@ -452,6 +451,11 @@ class Array { return iterator(storage); } + CUTLASS_HOST_DEVICE + const_iterator begin() const { + return cbegin(); + } + CUTLASS_HOST_DEVICE const_iterator cbegin() const { return const_iterator(storage); @@ -462,6 +466,11 @@ class Array { return iterator(reinterpret_cast(storage + kStorageElements)); } + CUTLASS_HOST_DEVICE + const_iterator end() const { + return cend(); + } + CUTLASS_HOST_DEVICE const_iterator cend() const { return 
const_iterator(reinterpret_cast(storage + kStorageElements)); @@ -472,6 +481,11 @@ class Array { return reverse_iterator(reinterpret_cast(storage + kStorageElements)); } + CUTLASS_HOST_DEVICE + const_reverse_iterator rbegin() const { + return crbegin(); + } + CUTLASS_HOST_DEVICE const_reverse_iterator crbegin() const { return const_reverse_iterator(reinterpret_cast(storage + kStorageElements)); @@ -482,6 +496,11 @@ class Array { return reverse_iterator(reinterpret_cast(storage)); } + CUTLASS_HOST_DEVICE + const_reverse_iterator rend() const { + return crend(); + } + CUTLASS_HOST_DEVICE const_reverse_iterator crend() const { return const_reverse_iterator(reinterpret_cast(storage)); @@ -493,77 +512,2092 @@ class Array { }; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Factories //////////////////////////////////////////////////////////////////////////////////////////////////// template CUTLASS_HOST_DEVICE Array make_Array(Element x) { - Array m; - m[0] = x; - return m; + return {x}; } template CUTLASS_HOST_DEVICE Array make_Array(Element x, Element y) { - Array m; - m[0] = x; - m[1] = y; - return m; + return {x,y}; } template CUTLASS_HOST_DEVICE Array make_Array(Element x, Element y, Element z) { - Array m; - m[0] = x; - m[1] = y; - m[2] = z; - return m; + return {x,y,z}; } template CUTLASS_HOST_DEVICE Array make_Array(Element x, Element y, Element z, Element w) { - Array m; - m[0] = x; - m[1] = y; - m[2] = z; - m[3] = w; - return m; + return {x,y,z,w}; } -//////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace cutlass +///////////////////////////////////////////////////////////////////////////////////////////////// +// functional.h numeric specializations +///////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// +template +struct absolute_value_op< Array > { -#include "cutlass/array_subbyte.h" + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs) const { -//////////////////////////////////////////////////////////////////////////////////////////////////// + Array result; + absolute_value_op scalar_op; -namespace cutlass { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i]); + } -//////////////////////////////////////////////////////////////////////////////////////////////////// + return result; + } +}; -/// Aligned array type -template < - /// Element type - typename T, - /// Number of elements in the array - int N, - /// Alignment requirement in bytes - int Alignment = sizeof_bits::value * N / 8 -> -class alignas(Alignment) AlignedArray: public Array { -public: +template +struct plus> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + + Array result; + plus scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + plus scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( T const &scalar, Array const &rhs) const { + + Array result; + plus scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + 
} + + return result; + } }; +template +struct minus> { -//////////////////////////////////////////////////////////////////////////////////////////////////// + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { -} // namespace cutlass + Array result; + minus scalar_op; -//////////////////////////////////////////////////////////////////////////////////////////////////// + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + minus scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( T const &scalar, Array const &rhs) const { + + Array result; + minus scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + } + + return result; + } +}; + +template +struct multiplies> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + + Array result; + multiplies scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + multiplies scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( T const &scalar, Array const &rhs) const { + + Array result; + multiplies scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + } + + return result; + } +}; + +template +struct maximum_absolute_value_reduction, PropogateNaN> { + + CUTLASS_HOST_DEVICE + T operator() (T const& scalar, Array const& rhs) const { + + T result = scalar; + maximum_absolute_value_reduction scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result = scalar_op(result, rhs[i]); + } + + return result; + } +}; + +template +struct scale> { + T const scaling_factor_; + + CUTLASS_HOST_DEVICE + scale(T scaling_factor) : scaling_factor_(scaling_factor) { + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & rhs) const { + Array result; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = rhs[i] * scaling_factor_; + } + + return result; + } +}; + +template +struct divides> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + + Array result; + divides scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + divides scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( T const &scalar, Array const &rhs) const { + + Array result; + divides scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + } + + return result; + } +}; + +template +struct reciprocal_approximate> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs) const { + + Array result; + reciprocal_approximate scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; 
++i) { + result[i] = scalar_op(lhs[i]); + } + + return result; + } +}; + +template +struct reciprocal_approximate_ftz> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs) const { + + Array result; + reciprocal_approximate_ftz scalar_op; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i]); + } + + return result; + } +}; + +template +struct maximum, PropagateNaN> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + + Array result; + maximum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + maximum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(T const &scalar, Array const &rhs) const { + + Array result; + maximum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + } + + return result; + } +}; + +template +struct minimum, PropagateNaN> { + + CUTLASS_HOST_DEVICE + static T scalar_op(T const &lhs, T const &rhs) { + return (rhs < lhs ? rhs : lhs); + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + + Array result; + minimum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + minimum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(T const &scalar, Array const &rhs) const { + + Array result; + minimum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + } + + return result; + } +}; + +template +struct minimum_with_nan_propagation> : minimum, true> +{}; + +template +struct negate> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs) const { + + Array result; + negate scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i]); + } + + return result; + } +}; + +/// Fused multiply-add +template +struct multiply_add, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b, Array const &c) const { + + Array result; + multiply_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], b[i], c[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, T const &scalar, Array const &c) const { + + Array result; + multiply_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], scalar, c[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(T const &scalar, Array const &b, Array const &c) const { + + Array result; + multiply_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, b[i], c[i]); + } + + return result; + } +}; + +/// Fused square-and-plus +template +struct square_and_plus> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + multiply_add, Array, Array> ma_op; + return ma_op(rhs, rhs, lhs); + } + + 
CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &rhs) const { + plus> plus_op; + multiplies multiplies_op; + return plus_op(multiplies_op(rhs, rhs), lhs); + } +}; + +/// Inverse-square-root +template +struct inverse_square_root> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a) const { + Array result; + inverse_square_root scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i]); + } + return result; + } +}; + +template +struct inverse_square_root> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & a) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = h2rsqrt(a_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half d_residual = hrsqrt(a_residual_ptr[N - 1]); + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + inverse_square_root scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i]); + } + + #endif + + return result; + } +}; + +/// Fused multiply-add-relu0 +template +struct multiply_add_relu0, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b, Array const &c) const { + + Array result; + multiply_add scalar_op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(scalar_op(a[i], b[i], c[i]), T(0)); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, T const &scalar, Array const &c) const { + + Array result; + multiply_add scalar_op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(scalar_op(a[i], scalar, c[i]), T(0)); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(T const &scalar, Array const &b, Array const &c) const { + + Array result; + multiply_add scalar_op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(scalar_op(scalar, b[i], c[i]), T(0)); + } + + return result; + } +}; + + +template +struct conjugate > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a) const { + + conjugate conj_op; + + Array ca; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + ca[i] = conj_op(a[i]); + } + return ca; + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// functional.h numeric specializations targeting SIMD instructions in device code. 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct plus> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hadd2(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + __half d_residual = __hadd(a_residual_ptr[N - 1], b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] + rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(half_t const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs)); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hadd2(lhs_pair, rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + __half d_residual = __hadd(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs + rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, half_t const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hadd2(lhs_ptr[i], rhs_pair); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half d_residual = __hadd(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] + rhs; + } + #endif + + return result; + } +}; + +template +struct minus> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hsub2(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + __half d_residual = __hsub(a_residual_ptr[N - 1], b_residual_ptr[N - 1]); + + result[N - 1] = 
reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] - rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(half_t const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs)); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hsub2(lhs_pair, rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + __half d_residual = __hsub(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs - rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, half_t const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hsub2(lhs_ptr[i], rhs_pair); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half d_residual = __hsub(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] - rhs; + } + #endif + + return result; + } +}; + +template +struct multiplies> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hmul2(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + __half d_residual = __hmul(a_residual_ptr[N - 1], b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] * rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(half_t const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs)); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hmul2(lhs_pair, rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = __hmul( + reinterpret_cast<__half const &>(lhs), + b_residual_ptr[N - 1]); + + result[N - 1] = 
reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs * rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, half_t const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hmul2(lhs_ptr[i], rhs_pair); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + + __half d_residual = __hmul( + a_residual_ptr[N - 1], + reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] * rhs; + } + #endif + + return result; + } +}; + +template +struct divides> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __h2div(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = __hdiv( + a_residual_ptr[N - 1], + b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] / rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(half_t const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs)); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __h2div(lhs_pair, rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = __hdiv( + reinterpret_cast<__half const &>(lhs), + b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs / rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, half_t const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __h2div(lhs_ptr[i], rhs_pair); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + + __half d_residual = __hdiv( + a_residual_ptr[N - 1], + reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = 
reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] / rhs; + } + #endif + + return result; + } +}; + +template +struct negate> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *source_ptr = reinterpret_cast<__half2 const *>(&lhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hneg2(source_ptr[i]); + } + + if constexpr (N % 2) { + half_t x = -lhs[N - 1]; + __half lhs_val = reinterpret_cast<__half const &>(x); + result[N - 1] = reinterpret_cast(lhs_val); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = -lhs[i]; + } + #endif + + return result; + } +}; + +/// Fused multiply-add +template +struct multiply_add, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_ptr[i]); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + + __half d_residual = __hfma( + a_residual_ptr[N - 1], + b_residual_ptr[N - 1], + c_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + half_t const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 a_pair = __half2half2(reinterpret_cast<__half const &>(a)); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2(a_pair, b_ptr[i], c_ptr[i]); + } + + if constexpr (N % 2) { + + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + __half d_residual = __hfma( + reinterpret_cast<__half const &>(a), + b_residual_ptr[N - 1], + c_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a, b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + half_t const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b)); + __half2 const *c_ptr = 
reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2(a_ptr[i], b_pair, c_ptr[i]); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + + __half d_residual = __hfma( + a_residual_ptr[N - 1], + reinterpret_cast<__half const &>(b), + c_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b, c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + half_t const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_pair); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + + __half d_residual = __hfma( + a_residual_ptr[N - 1], + b_residual_ptr[N - 1], + reinterpret_cast<__half const &>(c)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c); + } + #endif + + return result; + } +}; + +/// Fused multiply-add-relu0 +template +struct multiply_add_relu0, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2_relu(a_ptr[i], b_ptr[i], c_ptr[i]); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + + __half d_residual = __hfma_relu( + a_residual_ptr[N - 1], + b_residual_ptr[N - 1], + c_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(op(a[i], b[i], c[i]), (half_t)0); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + half_t const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 a_pair = __half2half2(reinterpret_cast<__half const &>(a)); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2_relu(a_pair, b_ptr[i], c_ptr[i]); + } + 
+ if constexpr (N % 2) { + + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + __half d_residual = __hfma_relu( + reinterpret_cast<__half const &>(a), + b_residual_ptr[N - 1], + c_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(op(a, b[i], c[i]), half_t(0)); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + half_t const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b)); + __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2_relu(a_ptr[i], b_pair, c_ptr[i]); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + + __half d_residual = __hfma_relu( + a_residual_ptr[N - 1], + reinterpret_cast<__half const &>(b), + c_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(op(a[i], b, c[i]), half_t(0)); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + half_t const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2_relu(a_ptr[i], b_ptr[i], c_pair); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + + __half d_residual = __hfma_relu( + a_residual_ptr[N - 1], + b_residual_ptr[N - 1], + reinterpret_cast<__half const &>(c)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(op(a[i], b[i], c), half_t(0)); + } + #endif + + return result; + } +}; + +template +struct minimum, PropagateNaN> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_ptr[i], rhs_ptr[i]) + : __hmin2(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = PropagateNaN ? 
__hmin_nan(a_residual_ptr[N - 1], b_residual_ptr[N - 1]) + : __hmin(a_residual_ptr[N - 1], b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + minimum mn; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mn(lhs[i],rhs[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(half_t const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs)); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_pair, rhs_ptr[i]) + : __hmin2(lhs_pair, rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = PropagateNaN ? __hmin_nan(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]) + : __hmin(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + minimum mn; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mn(lhs, rhs[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, half_t const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_ptr[i], rhs_pair) + : __hmin2(lhs_ptr[i], rhs_pair); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + + __half d_residual = PropagateNaN ? __hmin_nan(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)) + : __hmin(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + minimum mn; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mn(lhs[i], rhs); + } + #endif + + return result; + } +}; + +template +struct maximum, PropagateNaN> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_ptr[i], rhs_ptr[i]) + : __hmax2(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = PropagateNaN ? 
__hmax_nan(a_residual_ptr[N - 1], b_residual_ptr[N - 1])
+                                       : __hmax(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+
+    maximum<half_t, PropagateNaN> mx;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mx(lhs[i], rhs[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_pair, rhs_ptr[i])
+                                   : __hmax2(lhs_pair, rhs_ptr[i]);
+    }
+
+    if constexpr (N % 2) {
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+
+      __half d_residual = PropagateNaN ? __hmax_nan(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1])
+                                       : __hmax(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+
+    maximum<half_t, PropagateNaN> mx;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mx(lhs, rhs[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_ptr[i], rhs_pair)
+                                   : __hmax2(lhs_ptr[i], rhs_pair);
+    }
+
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+
+      __half d_residual = PropagateNaN ?
__hmax_nan(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)) + : __hmax(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(lhs[i], rhs); + } + #endif + + return result; + } +}; + +/// Fused multiply-add +template +struct multiply_add, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *b_ptr = reinterpret_cast(&b); + unsigned const *c_ptr = reinterpret_cast(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_ptr[i]) + ); + } + + if constexpr (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + bfloat16_t const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *b_ptr = reinterpret_cast(&b); + unsigned const *c_ptr = reinterpret_cast(&c); + + unsigned a_packed = static_cast(a.raw()); + a_packed = (a_packed | (a_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_packed), "r"(b_ptr[i]), "r"(c_ptr[i]) + ); + } + + if constexpr (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[0]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a, b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + bfloat16_t const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *c_ptr = reinterpret_cast(&c); + + unsigned b_packed = static_cast(b.raw()); + b_packed = (b_packed | (b_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_packed), "r"(c_ptr[i]) + ); + } + + if constexpr (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); 
+ uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[0]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b, c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + bfloat16_t const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *b_ptr = reinterpret_cast(&b); + + unsigned c_packed = static_cast(c.raw()); + c_packed = (c_packed | (c_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_packed) + ); + } + + if constexpr (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[0]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c); + } + #endif + + return result; + } +}; + + +/// bit_and +template +struct bit_and> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] & b_data[i]); + } + + return result; + } +}; + + +/// bit_or +template +struct bit_or> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] | b_data[i]); + } + + return result; + } +}; + + +/// bit_not +template +struct bit_not> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (~a_data[i]); + } + + return result; + } +}; + + +/// bit_xor +template +struct bit_xor> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] ^ b_data[i]); + } + + return result; + } +}; + 
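
The functor specializations above are what let elementwise math on `cutlass::Array<T, N>` lower to packed `__half2`/`bf16x2` instructions on SM53+/SM80+ while falling back to scalar loops on the host and on older architectures. The standalone sketch below is editorial, not part of this diff; the function name `axpy`, the fragment width of 8, and the exact include set are assumptions chosen for illustration. It shows how these functors and the operator overloads in the next section are typically exercised:

```cpp
#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/functional.h"
#include "cutlass/half.h"

// Illustrative only: alpha * x + y on a register-resident fragment.
// On SM53+ the multiply_add/plus specializations above use __hfma2/__hadd2
// over packed half pairs; otherwise the scalar fallback loop runs.
CUTLASS_HOST_DEVICE
cutlass::Array<cutlass::half_t, 8> axpy(
    cutlass::half_t alpha,
    cutlass::Array<cutlass::half_t, 8> const &x,
    cutlass::Array<cutlass::half_t, 8> const &y) {

  // B and C default to A in the multiply_add declaration, so this picks the
  // Array-by-Array specialization defined above.
  cutlass::multiply_add<cutlass::Array<cutlass::half_t, 8>> fma_op;

  // Scalar-broadcast overload: operator()(half_t const &, Array const &, Array const &)
  cutlass::Array<cutlass::half_t, 8> r = fma_op(alpha, x, y);

  // The free operator overloads (next section) give the same elementwise
  // semantics with infix syntax.
  return r + y;
}
```

The same pattern applies to `bfloat16_t`, where the fused path issues `fma.rn.bf16x2` via inline PTX as shown in the specialization above.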
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Operator overloads
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator+(Array<T, N> const &lhs, Array<T, N> const &rhs) {
+  plus<Array<T, N>> op;
+  return op(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator+(T const &lhs, Array<T, N> const &rhs) {
+  plus<Array<T, N>> op;
+  return op(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator+(Array<T, N> const &lhs, T const &rhs) {
+  plus<Array<T, N>> op;
+  return op(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator-(Array<T, N> const &lhs, Array<T, N> const &rhs) {
+  minus<Array<T, N>> op;
+  return op(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator-(Array<T, N> const &lhs) {
+  negate<Array<T, N>> op;
+  return op(lhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator*(Array<T, N> const &lhs, Array<T, N> const &rhs) {
+  multiplies<Array<T, N>> op;
+  return op(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator*(T lhs, Array<T, N> const &rhs) {
+  multiplies<Array<T, N>> op;
+  return op(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator*(Array<T, N> const &lhs, T rhs) {
+  multiplies<Array<T, N>> op;
+  return op(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator/(Array<T, N> const &lhs, Array<T, N> const &rhs) {
+  divides<Array<T, N>> op;
+  return op(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> fma(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) {
+  multiply_add<Array<T, N>> op;
+  return op(a, b, c);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> fma(T a, Array<T, N> const &b, Array<T, N> const &c) {
+  multiply_add<Array<T, N>> op;
+  return op(a, b, c);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> fma(Array<T, N> const &a, T b, Array<T, N> const &c) {
+  multiply_add<Array<T, N>> op;
+  return op(a, b, c);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> fma(Array<T, N> const &a, Array<T, N> const &b, T c) {
+  multiply_add<Array<T, N>> op;
+  return op(a, b, c);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// AlignedArray
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Aligned array type
+template <
+  /// Element type
+  typename T,
+  /// Number of elements in the array
+  int N,
+  /// Alignment requirement in bytes
+  int Alignment = ( sizeof_bits<T>::value * N + 7 ) / 8
+>
+class alignas(Alignment) AlignedArray: public Array<T, N> {
+public:
+
+};
+
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/array_subbyte.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/array_planar_complex.h b/include/cutlass/array_planar_complex.h
index 0d8d2c9899..2dd8aa84e1 100644
--- a/include/cutlass/array_planar_complex.h
+++ b/include/cutlass/array_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -51,13 +51,12 @@ struct ArrayPlanarComplex { using Element = Element_; /// Number of logical elements - static size_t const kElements = N; + static constexpr size_t kElements = N; /// Underlying Fragment of real-valued elemenets - using ArrayReal = Array; + using ArrayReal = cutlass::Array; public: - /// Fragment of real-valued elements representing the real part ArrayReal real; @@ -65,19 +64,6 @@ struct ArrayPlanarComplex { ArrayReal imag; public: - - /// Ctor - CUTLASS_HOST_DEVICE - ArrayPlanarComplex() { } - - /// Ctor - CUTLASS_HOST_DEVICE - ArrayPlanarComplex( - ArrayReal const &real_, - ArrayReal const &imag_ - ): - real(real_), imag(imag_) { } - /// Sets the array to zero efficiently CUTLASS_HOST_DEVICE void clear() { @@ -93,7 +79,7 @@ template CUTLASS_HOST_DEVICE ArrayPlanarComplex make_ArrayPlanarComplex(Array const &real, Array const &imag) { - return ArrayPlanarComplex(real, imag); + return ArrayPlanarComplex{real, imag}; } ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/array_subbyte.h b/include/cutlass/array_subbyte.h index 12b1665f01..d2e0e5efdb 100644 --- a/include/cutlass/array_subbyte.h +++ b/include/cutlass/array_subbyte.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -48,10 +48,8 @@ template < typename T, int N > -class Array { -public: - - static int const kSizeBits = sizeof_bits::value * N; +struct Array { + static constexpr int kSizeBits = sizeof_bits::value * N; /// Storage type using Storage = typename platform::conditional< @@ -68,16 +66,16 @@ class Array { using Element = T; /// Number of logical elements per stored object - static int const kElementsPerStoredItem = int(sizeof(Storage) * 8) / sizeof_bits::value; + static constexpr int kElementsPerStoredItem = int(sizeof(Storage) * 8) / sizeof_bits::value; /// Number of storage elements - static size_t const kStorageElements = N / kElementsPerStoredItem; + static constexpr size_t kStorageElements = (N + kElementsPerStoredItem - 1) / kElementsPerStoredItem; /// Number of logical elements - static size_t const kElements = N; + static constexpr size_t kElements = N; /// Bitmask for covering one item - static Storage const kMask = ((Storage(1) << sizeof_bits::value) - 1); + static constexpr Storage kMask = ((Storage(1) << sizeof_bits::value) - 1); // // C++ standard members with pointer types removed @@ -96,16 +94,14 @@ class Array { /// Reference object inserts or extracts sub-byte items class reference { /// Pointer to storage element - Storage *ptr_; + Storage *ptr_{nullptr}; /// Index into elements packed into Storage object - int idx_; + int idx_{0}; public: - /// Default ctor - CUTLASS_HOST_DEVICE - reference(): ptr_(nullptr), idx_(0) { } + reference() = default; /// Ctor CUTLASS_HOST_DEVICE @@ -114,11 +110,38 @@ class Array { /// Assignment CUTLASS_HOST_DEVICE reference &operator=(T x) { + // `*ptr_ & kUpdateMask` will read ptr_ before write to it + // This means code pattern like + // + // ```cpp + // Array result; + // result[0] = xxx; + // ``` + // + // Will leads to 
compiler warning on use of unintialized member variable. Although we know + // this read of uninitialized member variable is harmeless. + +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wuninitialized" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wuninitialized" +# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + Storage item = (reinterpret_cast(x) & kMask); Storage kUpdateMask = Storage(~(kMask << (idx_ * sizeof_bits::value))); + *ptr_ = Storage(((*ptr_ & kUpdateMask) | (item << idx_ * sizeof_bits::value))); +#if defined(__clang__) +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + return *this; } @@ -151,16 +174,14 @@ class Array { class const_reference { /// Pointer to storage element - Storage const *ptr_; + Storage const *ptr_{nullptr}; /// Index into elements packed into Storage object - int idx_; + int idx_{0}; public: - /// Default ctor - CUTLASS_HOST_DEVICE - const_reference(): ptr_(nullptr), idx_(0) { } + const_reference() = default; /// Ctor CUTLASS_HOST_DEVICE @@ -200,15 +221,14 @@ class Array { class iterator { /// Pointer to storage element - Storage *ptr_; + Storage *ptr_{nullptr}; /// Index into elements packed into Storage object - int idx_; + int idx_{0}; public: - CUTLASS_HOST_DEVICE - iterator(): ptr_(nullptr), idx_(0) { } + iterator() = default; CUTLASS_HOST_DEVICE iterator(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { } @@ -279,15 +299,14 @@ class Array { class const_iterator { /// Pointer to storage element - Storage const *ptr_; + Storage const *ptr_{nullptr}; /// Index into elements packed into Storage object - int idx_; + int idx_{0}; public: - CUTLASS_HOST_DEVICE - const_iterator(): ptr_(nullptr), idx_(0) { } + const_iterator() = default; CUTLASS_HOST_DEVICE const_iterator(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { } @@ -358,62 +377,36 @@ class Array { class reverse_iterator { /// Pointer to storage element - Storage *ptr_; + Storage *ptr_{nullptr}; /// Index into elements packed into Storage object - int idx_; + int idx_{0}; public: - CUTLASS_HOST_DEVICE - reverse_iterator(): ptr_(nullptr), idx_(0) { } + reverse_iterator() = default; CUTLASS_HOST_DEVICE reverse_iterator(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { } - - // TODO }; /// Bidirectional constant iterator over elements class const_reverse_iterator { /// Pointer to storage element - Storage const *ptr_; + Storage const *ptr_{nullptr}; /// Index into elements packed into Storage object - int idx_; + int idx_{0}; public: - CUTLASS_HOST_DEVICE - const_reverse_iterator(): ptr_(nullptr), idx_(0) { } + const_reverse_iterator() = default; CUTLASS_HOST_DEVICE const_reverse_iterator(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { } - - // TODO }; -private: - - /// Internal storage - Storage storage[kStorageElements]; - -public: - - #if 0 - CUTLASS_HOST_DEVICE - Array() { } - - CUTLASS_HOST_DEVICE - Array(Array const &x) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < int(kStorageElements); ++i) { - storage[i] = x.storage[i]; - } - } - #endif - /// Efficient clear method CUTLASS_HOST_DEVICE void clear() { @@ -484,7 +477,6 @@ class Array { return storage; } - CUTLASS_HOST_DEVICE constexpr bool empty() const { return !kElements; @@ -555,14 +547,15 @@ class Array { return const_reverse_iterator(storage); } - // - // Comparison operators - // - +private: + /// Internal storage + Storage storage[kStorageElements]; }; 
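
Two details of the sub-byte `Array` specialization above are worth calling out: `kStorageElements` now rounds up, so a logical length that does not fill the last storage word still gets a word allocated, and `reference::operator=` inserts an element through a mask-and-shift read-modify-write of that word. The arithmetic is illustrated by this plain-C++ sketch, which is editorial and independent of CUTLASS; the 4-bit width, the 8-bit storage word, and the `set` helper are assumptions for illustration only:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Example: 4-bit elements packed into 8-bit storage words.
  constexpr int kBitsPerElement     = 4;
  constexpr int kN                  = 10;                   // logical elements
  constexpr int kElementsPerStorage = 8 / kBitsPerElement;  // 2 per uint8_t
  // Round up, as in the updated kStorageElements computation: 10 -> 5 words.
  constexpr int kStorageElements =
      (kN + kElementsPerStorage - 1) / kElementsPerStorage;

  uint8_t storage[kStorageElements] = {};

  // Masked insert of value `x` at logical index `idx`, mirroring the
  // locate-then-read-modify-write done by operator[] and reference::operator=.
  auto set = [&](int idx, uint8_t x) {
    uint8_t const kMask = (uint8_t(1) << kBitsPerElement) - 1;  // 0x0F
    int word  = idx / kElementsPerStorage;
    int shift = (idx % kElementsPerStorage) * kBitsPerElement;
    storage[word] = uint8_t((storage[word] & ~(kMask << shift)) |
                            ((x & kMask) << shift));
  };

  set(9, 0x7);
  std::printf("storage words: %d, last word: 0x%02x\n",
              kStorageElements, unsigned(storage[kStorageElements - 1]));
  return 0;
}
```

With ten two-per-byte elements the round-up yields five storage bytes, which is exactly what the updated `kStorageElements` expression computes; the old expression would have truncated to four and dropped the last element.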
//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace cutlass //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h new file mode 100644 index 0000000000..6f2373b6df --- /dev/null +++ b/include/cutlass/barrier.h @@ -0,0 +1,377 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implementation of a CTA-wide barrier for inter-CTA synchronization. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +namespace detail { + +// +// Utilities for abstracting synchronization methods for barriers +// + +struct SyncthreadsSync { + CUTLASS_DEVICE + static void sync() { + __syncthreads(); + } +}; + +struct SyncwarpSync { + CUTLASS_DEVICE + static void sync() { + __syncwarp(); + } +}; + +template < + int ThreadCount, + int BarrierId +> +struct NamedBarrierSync { + CUTLASS_DEVICE + static void sync() { + cutlass::arch::NamedBarrier::sync(ThreadCount, static_cast(BarrierId)); + } +}; + +} // namepspace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Group or CTA-wide semaphore for inter-CTA synchronization. 
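
The `GenericBarrier` template defined just below implements that semaphore as an acquire/release protocol over integer flags in global memory: producer CTAs call `arrive_inc`, and consumer CTAs spin in `wait_eq`/`wait_lt` until the expected count has arrived. The kernel fragment below is a hypothetical editorial sketch of the intended call pattern; the kernel name, workspace layout, and producer/consumer grid split are assumptions, not part of this diff:

```cpp
#include "cutlass/barrier.h"

// Hypothetical: CTAs with blockIdx.x < num_producers each contribute a partial
// result for tile blockIdx.y; one consumer CTA per tile reduces the partials.
// `workspace` holds one int flag per tile and is zero-initialized before launch.
__global__ void reduce_with_flags(int *workspace, int num_producers) {
  using Barrier = cutlass::Barrier;   // GenericBarrier synchronized with __syncthreads()
  int tile_idx   = blockIdx.y;
  int thread_idx = threadIdx.x;

  if (int(blockIdx.x) < num_producers) {
    // ... write this CTA's partial result for the tile ...
    // Release: after a CTA-wide sync, thread 0 bumps the tile's flag.
    Barrier::arrive_inc(workspace, thread_idx, tile_idx);
  }
  else {
    // Acquire: spin until all producers for this tile have signaled.
    Barrier::wait_eq(workspace, thread_idx, tile_idx, num_producers);
    // ... the producers' partials are now visible; reduce them into the output ...
  }
}
```

Note that `arrive_inc` performs the group-wide sync before thread 0 issues the release-ordered increment, which is why callers only pass the flag index and their thread index rather than synchronizing explicitly.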
+template +struct GenericBarrier { + +public: + + /// Flag type + using T = int; + + /// Initial flag value + static const T INIT = 0; + + +protected: + + /// Load flag, as a strong acquire operation (int specialization) + CUTLASS_DEVICE + static int ld_acquire(int *ptr) + { + int state = 0; + +#if (__CUDA_ARCH__ >= 700) + /// SM70 and newer use memory consistency qualifiers + + // Acquire pattern using acquire modifier + asm volatile ("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(ptr)); + +#else + asm volatile ("ld.cg.global.b32 %0, [%1];\n" : "=r"(state) : "l"(ptr)); +#endif // (__CUDA_ARCH__ >= 700) + + return state; + } + + + /// Reduce into flag, with release pattern (int specialization) + CUTLASS_DEVICE + static void red_release(int *ptr, int val) + { +#if (__CUDA_ARCH__ >= 700) + /// SM70 and newer use memory consistency qualifiers + + // Release pattern using acq_rel fence + relaxed modifier. (The fence also releases data + // that was weakly-written by other threads prior to the last syncthreads) + asm volatile ("fence.acq_rel.gpu;\n"); + asm volatile ("red.relaxed.gpu.global.add.s32 [%0], %1;\n" : : "l"(ptr), "r"(val)); + +#else + __threadfence(); + atomicAdd(ptr, val); +#endif // (__CUDA_ARCH__ >= 700) + } + + +public: + + /// Uses thread[0] to wait for at least the specified count of signals on the given flag counter + CUTLASS_DEVICE + static void wait_lt(void *lock_ptr, int thread_idx, int flag_idx, int count) + { + T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; + + if (thread_idx == 0) + { + // Spin-loop + #pragma unroll 1 + while(ld_acquire(flag_ptr) < count) {} + } + + Sync::sync(); + } + + /// Uses thread[0] to wait for at least the specified count of signals on the given flag counter + CUTLASS_DEVICE + static void wait_eq(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) + { + T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; + + if (thread_idx == 0) + { + // Spin-loop + #pragma unroll 1 + while(ld_acquire(flag_ptr) != val) {} + } + Sync::sync(); + } + + /// Uses thread[0] to wait for the specified count of signals on the given flag counter + CUTLASS_DEVICE + static void wait_eq_reset(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { + T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; + + if (thread_idx == 0) + { + // Spin-loop + #pragma unroll 1 + while(atomicCAS(flag_ptr, val, 0) != val) {} + } + + Sync::sync(); + } + + /// Increment the arrival count for a flag + CUTLASS_DEVICE + static void arrive_inc(void *lock_ptr, int thread_idx, int flag_idx, int val = 1) + { + T* flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; + + Sync::sync(); + + if (thread_idx == 0) + { + red_release(flag_ptr, val); + } + } + + + /// Increment the arrival counts for a range of flags + CUTLASS_DEVICE + static void arrive_range_inc(void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) + { + int flag_idx = first_flag_idx + thread_idx; + T* flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; + + // Barrier to make sure all other threads in group have written their data + Sync::sync(); + + // Select threads increment their flags + if (thread_idx < count) { + red_release(flag_ptr, val); + } + } +}; + +using Barrier = GenericBarrier; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/** Structure for managing multiple NamedBarriers to be used by different warp groups, allowing + * runtime index values to be used to call into named barriers with compile-time-constant IDs. 
+ * + * @param ThreadCount_ Number of threads that will wait on a NamedBarrier with a given ID + * @param Offset Value added to the ID passed in by the user to determine the NamedBarrier ID to call into + * @param MaxNumNamedBarriers The maximum number of unique barrier IDs that will be requested on this type +**/ +template < + uint32_t ThreadCount_, + uint32_t Offset = 0, + uint32_t MaxNumNamedBarriers = 16 +> +struct NamedBarrierManager { + + static_assert(MaxNumNamedBarriers <= arch::NamedBarrier::HardwareMaxNumNamedBarriers); + static_assert(MaxNumNamedBarriers + Offset <= arch::NamedBarrier::HardwareMaxNumNamedBarriers, "Barrier IDs cannot exceed 15"); + + // Number of threads participating in the barrier + static constexpr uint32_t ThreadCount = ThreadCount_; + + template + using BarrierSync = cutlass::GenericBarrier>; + + // Underlying type used by all barriers for synchronization. Does not depend on + // template parameter BarrierId, so passing in 0 suffices. + using T = typename BarrierSync<0>::T; + + using IntegerSequence = cute::make_integer_sequence; + + CUTLASS_DEVICE + static + void wait_lt(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int count) { + wait_lt_helper(idx, lock_ptr, thread_idx, flag_idx, count, IntegerSequence{}); + } + + CUTLASS_DEVICE + static void + wait_eq(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { + wait_eq_helper(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{}); + } + + CUTLASS_DEVICE + static void + wait_eq_reset(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { + wait_eq_helper(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{}); + } + + CUTLASS_DEVICE + static void + arrive_inc(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) { + arrive_inc_helper(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{}); + } + + CUTLASS_DEVICE + static void + arrive_range_inc(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) { + arrive_range_inc_helper(idx, lock_ptr, thread_idx, first_flag_idx, count, val, IntegerSequence{}); + } + +private: + CUTLASS_DEVICE + static void + check_barrier_in_range([[maybe_unused]] uint32_t idx) { + assert((idx < MaxNumNamedBarriers) && "Index exceeds barrier count"); + } + + template + CUTLASS_DEVICE + static void + wait_lt_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int count, cute::integer_sequence) { + check_barrier_in_range(idx); + ((Idx == idx && (BarrierSync::wait_lt(lock_ptr, thread_idx, flag_idx, count), true)) || ...); + } + + template + CUTLASS_DEVICE + static void + wait_eq_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val, cute::integer_sequence) { + check_barrier_in_range(idx); + if constexpr (Reset) { + ((Idx == idx && (BarrierSync::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val), true)) || ...); + } + else { + ((Idx == idx && (BarrierSync::wait_eq(lock_ptr, thread_idx, flag_idx, val), true)) || ...); + } + } + + template + CUTLASS_DEVICE + static void + arrive_inc_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int val, cute::integer_sequence) { + check_barrier_in_range(idx); + ((Idx == idx && (BarrierSync::arrive_inc(lock_ptr, thread_idx, flag_idx, val), true)) || ...); + } + + template + CUTLASS_DEVICE + static void + arrive_range_inc_helper(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count, int val, cute::integer_sequence) { + check_barrier_in_range(idx); + ((Idx == idx 
&& (BarrierSync::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val), true)) || ...); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/** Structure for synchronizing via contiguous barriers (e.g., __syncwarp, __syncthreads) + * via an API that mirrors that of NamedBarrierManager + * + * @param Synchronizer Synchronization helper exposing a `sync()` method to perform synchronization +**/ +template < + class Synchronizer, + uint32_t ThreadCount_ +> +struct SyncManager { + + // Number of threads participating in the barrier + static constexpr uint32_t ThreadCount = ThreadCount_; + + using BarrierSync = cutlass::GenericBarrier; + + // Underlying type used by all barriers for synchronization. + using T = typename BarrierSync::T; + + CUTLASS_DEVICE + static + void wait_lt(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int count) { + BarrierSync::wait_lt(lock_ptr, thread_idx, flag_idx, count); + } + + CUTLASS_DEVICE + static void + wait_eq(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { + BarrierSync::wait_eq(lock_ptr, thread_idx, flag_idx, val); + } + + CUTLASS_DEVICE + static void + wait_eq_reset(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { + BarrierSync::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val); + } + + CUTLASS_DEVICE + static void + arrive_inc(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) { + BarrierSync::arrive_inc(lock_ptr, thread_idx, flag_idx, val); + } + + CUTLASS_DEVICE + static void + arrive_range_inc(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) { + BarrierSync::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/bfloat16.h b/include/cutlass/bfloat16.h index e4d20efc1e..5af6d3ab80 100644 --- a/include/cutlass/bfloat16.h +++ b/include/cutlass/bfloat16.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -33,16 +33,21 @@ \brief Defines a proxy class for storing non-standard 16-bit floating point values with 8 bits of exponent and 7 bit of mantissa. 
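// Usage sketch (illustrative only, not part of the patch above): NamedBarrierManager and
// SyncManager expose the same static interface, so block-scope flag exchange code can be
// written once and instantiated with either backend. The 256-thread count, the BlockSync
// helper, the `flags` workspace and the producer/consumer split are assumptions made for
// this sketch.
#include "cutlass/barrier.h"

// Hypothetical synchronizer satisfying the `sync()` requirement documented for SyncManager.
struct BlockSync {
  CUTLASS_DEVICE static void sync() { __syncthreads(); }
};

template <class BarrierManager>
CUTLASS_DEVICE void exchange_tile(void* flags, int thread_idx, int tile_idx) {
  // Producer side: bump the flag for this tile once its partial results are visible.
  BarrierManager::arrive_inc(/*barrier id*/ 0, flags, thread_idx, /*flag_idx*/ tile_idx);
  // Consumer side: block until the flag reaches 1, then reset it for the next round.
  BarrierManager::wait_eq_reset(/*barrier id*/ 0, flags, thread_idx, /*flag_idx*/ tile_idx, 1);
}

// Either backend satisfies the interface used by exchange_tile:
using NamedBackend = cutlass::NamedBarrierManager<256>;
using SimtBackend  = cutlass::SyncManager<BlockSync, 256>;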
*/ + #pragma once -#if !defined(__CUDACC_RTC__) +#if defined(__CUDACC_RTC__) +#include "cutlass/floating_point_nvrtc.h" +#else #include #include #include #include #endif +#include #include "cutlass/cutlass.h" +#include "cutlass/platform/platform.h" namespace cutlass { @@ -70,9 +75,41 @@ struct alignas(2) bfloat16_t { return h; } +private: + struct from_32_bit_integer_t {}; + static constexpr from_32_bit_integer_t from_32_bit_integer{}; + + template + CUTLASS_HOST_DEVICE + explicit bfloat16_t(from_32_bit_integer_t, T x) { + static_assert(cutlass::platform::is_integral::value && sizeof(T) == 4, "Requires 32-bit integer"); + + float flt = static_cast(x); + uint32_t bits; + + #if defined(__CUDA_ARCH__) + bits = reinterpret_cast(flt); + #else + std::memcpy(&bits, &flt, sizeof(bits)); + #endif + + storage = uint16_t(bits >> 16); + } + +public: /// Default constructor + bfloat16_t() = default; + + /// Reinterpret cast from CUDA's __nv_bfloat16 type CUTLASS_HOST_DEVICE - bfloat16_t() : storage(0) { } + explicit bfloat16_t(__nv_bfloat16 const & x) { + #if defined(__CUDA_ARCH__) + storage = reinterpret_cast(x); + #else + __nv_bfloat16_raw raw(x); + std::memcpy(&storage, &raw.x, sizeof(storage)); + #endif + } /// Floating-point conversion - round toward nearest CUTLASS_HOST_DEVICE @@ -117,18 +154,10 @@ struct alignas(2) bfloat16_t { /// Integer conversion - round toward nearest CUTLASS_HOST_DEVICE - explicit bfloat16_t(int x) { - float flt = static_cast(x); - uint32_t bits; - - #if defined(__CUDA_ARCH__) - bits = reinterpret_cast(flt); - #else - std::memcpy(&bits, &flt, sizeof(bits)); - #endif + explicit bfloat16_t(int x) : bfloat16_t(from_32_bit_integer, x) {} - storage = uint16_t(bits >> 16); - } + CUTLASS_HOST_DEVICE + explicit bfloat16_t(uint32_t x) : bfloat16_t(from_32_bit_integer, x) {} /// Converts to float CUTLASS_HOST_DEVICE @@ -161,6 +190,12 @@ struct alignas(2) bfloat16_t { return (float(*this) != 0.0f); } + /// Bitcasts to CUDA's bf16 type + CUTLASS_DEVICE + __nv_bfloat16 to_nv_bfloat16() const { + return reinterpret_cast<__nv_bfloat16 const &>(storage); + } + /// Obtains raw bits CUTLASS_HOST_DEVICE uint16_t raw() const { @@ -200,7 +235,7 @@ bool signbit(cutlass::bfloat16_t const& h) { CUTLASS_HOST_DEVICE cutlass::bfloat16_t abs(cutlass::bfloat16_t const& h) { - return cutlass::bfloat16_t::bitcast(h.raw() & 0x7fffffff); + return cutlass::bfloat16_t::bitcast(h.raw() & 0x7fff); } CUTLASS_HOST_DEVICE @@ -292,9 +327,9 @@ bfloat16_t copysign(bfloat16_t const& a, bfloat16_t const& b) { // /////////////////////////////////////////////////////////////////////////////////////////////////// +#if !defined(__CUDACC_RTC__) namespace std { -#if !defined(__CUDACC_RTC__) /// Numeric limits template <> struct numeric_limits { @@ -349,9 +384,78 @@ struct numeric_limits { CUTLASS_HOST_DEVICE static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); } }; -#endif } // namespace std +#endif + +namespace cutlass { +namespace platform { + +/// Forward Declaration +template +struct numeric_limits; + +/// Numeric limits +template <> +struct numeric_limits { + static bool const is_specialized = true; + static bool const is_signed = true; + static bool const is_integer = false; + static bool const is_exact = false; + static bool const has_infinity = true; + static bool const has_quiet_NaN = true; + static bool const has_signaling_NaN = false; +#if !defined(__CUDACC_RTC__) + static std::float_denorm_style const has_denorm = std::denorm_present; +#endif + static bool const has_denorm_loss = 
true; +#if !defined(__CUDACC_RTC__) + static std::float_round_style const round_style = std::round_to_nearest; +#endif + static bool const is_iec559 = false; + static bool const is_bounded = true; + static bool const is_modulo = false; + static int const digits = 7; + + /// Least positive value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t min() { return cutlass::bfloat16_t::bitcast(0x01); } + + /// Minimum finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t lowest() { return cutlass::bfloat16_t::bitcast(0xff7f); } + + /// Maximum finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t max() { return cutlass::bfloat16_t::bitcast(0x7f7f); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t epsilon() { return cutlass::bfloat16_t::bitcast(0x1000); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t round_error() { return cutlass::bfloat16_t(0.5f); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t infinity() { return cutlass::bfloat16_t::bitcast(0x7f80); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t quiet_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t signaling_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); } +}; + +} // namespace platform +} // namespace cutlass /////////////////////////////////////////////////////////////////////////////////////////////////// // @@ -365,114 +469,190 @@ namespace cutlass { CUTLASS_HOST_DEVICE bool operator==(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __heq(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else return float(lhs) == float(rhs); +#endif } CUTLASS_HOST_DEVICE bool operator!=(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hne(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else return float(lhs) != float(rhs); +#endif } CUTLASS_HOST_DEVICE bool operator<(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hlt(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else return float(lhs) < float(rhs); +#endif } CUTLASS_HOST_DEVICE bool operator<=(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hle(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else return float(lhs) <= float(rhs); +#endif } CUTLASS_HOST_DEVICE bool operator>(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hgt(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else return float(lhs) > float(rhs); +#endif } CUTLASS_HOST_DEVICE bool operator>=(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hge(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else return float(lhs) >= float(rhs); +#endif } CUTLASS_HOST_DEVICE bfloat16_t operator+(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return bfloat16_t(__hadd(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else return bfloat16_t(float(lhs) + float(rhs)); +#endif } CUTLASS_HOST_DEVICE 
bfloat16_t operator-(bfloat16_t const& lhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return bfloat16_t(__hneg(lhs.to_nv_bfloat16())); +#else return bfloat16_t(-float(lhs)); +#endif } CUTLASS_HOST_DEVICE bfloat16_t operator-(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return bfloat16_t(__hsub(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else return bfloat16_t(float(lhs) - float(rhs)); +#endif } CUTLASS_HOST_DEVICE bfloat16_t operator*(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return bfloat16_t(__hmul(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else return bfloat16_t(float(lhs) * float(rhs)); +#endif } CUTLASS_HOST_DEVICE bfloat16_t operator/(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return bfloat16_t(__hdiv(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else return bfloat16_t(float(lhs) / float(rhs)); +#endif } CUTLASS_HOST_DEVICE bfloat16_t& operator+=(bfloat16_t & lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else lhs = bfloat16_t(float(lhs) + float(rhs)); +#endif return lhs; } CUTLASS_HOST_DEVICE bfloat16_t& operator-=(bfloat16_t & lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else lhs = bfloat16_t(float(lhs) - float(rhs)); +#endif return lhs; } CUTLASS_HOST_DEVICE bfloat16_t& operator*=(bfloat16_t & lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hmul(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else lhs = bfloat16_t(float(lhs) * float(rhs)); +#endif return lhs; } CUTLASS_HOST_DEVICE bfloat16_t& operator/=(bfloat16_t & lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hdiv(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else lhs = bfloat16_t(float(lhs) / float(rhs)); +#endif return lhs; } CUTLASS_HOST_DEVICE bfloat16_t& operator++(bfloat16_t & lhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16())); +#else float tmp(lhs); ++tmp; lhs = bfloat16_t(tmp); +#endif return lhs; } CUTLASS_HOST_DEVICE bfloat16_t& operator--(bfloat16_t & lhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16())); +#else float tmp(lhs); --tmp; lhs = bfloat16_t(tmp); +#endif return lhs; } CUTLASS_HOST_DEVICE bfloat16_t operator++(bfloat16_t & lhs, int) { bfloat16_t ret(lhs); +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16())); +#else float tmp(lhs); tmp++; lhs = bfloat16_t(tmp); +#endif return ret; } CUTLASS_HOST_DEVICE bfloat16_t operator--(bfloat16_t & lhs, int) { bfloat16_t ret(lhs); +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16())); +#else float tmp(lhs); tmp--; lhs = bfloat16_t(tmp); +#endif return ret; } diff --git a/include/cutlass/blas3.h b/include/cutlass/blas3.h index 3c2df6dd6f..d41f1ee61e 100644 --- a/include/cutlass/blas3.h +++ b/include/cutlass/blas3.h @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -39,7 +39,9 @@ #include "cutlass/cutlass.h" #include "cutlass/array.h" +#include "cutlass/blas3_types.h" #include "cutlass/coord.h" +#include "cutlass/complex.h" #include "cutlass/functional.h" #include "cutlass/numeric_types.h" @@ -48,41 +50,7 @@ namespace cutlass { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Enumerated type describing the type of kernel (based on input or output matrices). -enum class BlasMode { - kGemm, - kSymmetric, - kHermitian, - kTriangular, - kInvalid -}; - -/// Enumerated type describing the fill mode for matrices for BLAS functions. -enum class FillMode { - kFull, /// The entire tensor is covered. - kLower, /// The 'lower' part of a tensor is covered including diagonal - kUpper, /// The 'upper' part of a tensor is covered including diaognal - kDiagonal, /// Only diagonal elements are covered. - kNone, /// No element is covered. - kInvalid -}; -/// Enumerated type describing the diagonal property of matrices for BLAS functions. -enum class DiagType { - kNonUnit, - kUnit, - kZero, // Only used internally for computing SYMM/HEMM - kInvalid -}; - -/// Enumerated type describing the side dense matrix is in matrix equation for BLAS functions. -enum class SideMode { - kLeft, - kRight, - kInvalid -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// /// Defines FillMode inversions template struct InvertFillMode; @@ -164,7 +132,7 @@ struct MantissaInBits { template <> struct MantissaInBits> { static int constexpr bits = 30; - static double constexpr error = 1.0e-15; + static double constexpr error = 1.0e-14; }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/blas3_types.h b/include/cutlass/blas3_types.h new file mode 100644 index 0000000000..653b93b771 --- /dev/null +++ b/include/cutlass/blas3_types.h @@ -0,0 +1,78 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Enumerated type describing the type of kernel (based on input or output matrices). +enum class BlasMode { + kGemm, + kSymmetric, + kHermitian, + kTriangular, + kInvalid +}; + +/// Enumerated type describing the fill mode for matrices for BLAS functions. +enum class FillMode { + kFull, /// The entire tensor is covered. + kLower, /// The 'lower' part of a tensor is covered including diagonal + kUpper, /// The 'upper' part of a tensor is covered including diaognal + kDiagonal, /// Only diagonal elements are covered. + kNone, /// No element is covered. + kInvalid +}; + +/// Enumerated type describing the diagonal property of matrices for BLAS functions. +enum class DiagType { + kNonUnit, + kUnit, + kZero, // Only used internally for computing SYMM/HEMM + kInvalid +}; + +/// Enumerated type describing the side dense matrix is in matrix equation for BLAS functions. +enum class SideMode { + kLeft, + kRight, + kInvalid +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/block_striped.h b/include/cutlass/block_striped.h new file mode 100644 index 0000000000..09f3fb04fc --- /dev/null +++ b/include/cutlass/block_striped.h @@ -0,0 +1,267 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Utilities for performing block-striped access (load, store, reduce) of trivially-copyable, + statically-sized array types to global memory. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/wmma_array.h" +#include "cutlass/functional.h" +#include "cutlass/complex.h" + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// +// AccessWidth +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes the maximal power-of-two that evenly divides the size of T, capped at Limit +template < + typename T, + int Limit> +struct AccessWidth +{ + // Inductive case + template < + int ObjectBytes, /// Size of T in bytes + int AlignBytes, /// Template induction variable + bool IsAligned = /// Whether ObjectBytes is an even multiple of AlignBytes + ((AlignBytes <= Limit) && (ObjectBytes % AlignBytes == 0))> + struct Detail + { + static const int value = Detail::value; + }; + + // Base case (ObjectBytes is not an even multiple of AlignBytes) + template < + int ObjectBytes, /// Size of T in bytes + int AlignBytes> /// Template induction variable + struct Detail + { + static const int value = AlignBytes / 2; + }; + + /// The maximal power-of-two that evenly divides the size of T + static const int value = Detail< + (int) sizeof(T), + 1>::value; +}; + + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// StripedAccessType +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// ReinterpretCast type for striping a trivially-copyable type in global memory +/// (Default specialization. Striping granularity is type T.) +template < + typename T, /// Data type + int TransferBytes = /// Data access width (16 byte max for global memory access on current architectures) + AccessWidth::value> +struct alignas(TransferBytes) StripedAccessType : public T +{}; + + +/// ReinterpretCast type for striping a trivially-copyable type in global memory +/// (Specialization for cutlass::Array. Striping granularity is a multiple of T.) +template < + typename T, /// Array element type + int N, /// Number of elements in array + bool RegisterSized, /// T is register-sized + int TransferBytes> /// Data access width +struct StripedAccessType< + Array, + TransferBytes> +: public AlignedArray< + T, // Element type of StripedAccessType + __NV_STD_MAX(1, TransferBytes / (int) sizeof(T)), // Number of elements T in StripedAccessType + TransferBytes> // Alignment of StripedAccessType +{}; + + +#if defined(CUTLASS_ARCH_WMMA_ENABLED) + +/// ReinterpretCast type for striping a trivially-copyable type in global memory +/// (Specialization for cutlass::WmmaFragmentArray. Striping granularity is a multiple of T.) 
+template< + typename Use, + int m, + int n, + int k, + typename ElementT, + typename Layout, + int kFragments, + int TransferBytes> +struct StripedAccessType< + WmmaFragmentArray, kFragments>, + TransferBytes> +: public AlignedArray< + ElementT, + __NV_STD_MAX(1, TransferBytes / (int) sizeof(ElementT)), + TransferBytes> +{}; + +#endif // if defined(CUTLASS_ARCH_WMMA_ENABLED) + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// BlockStriped +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Utility for performing block-striped access (load, store) of trivially-copyable, +/// statically-sized array types to global memory +template < + int BlockThreads, + typename ArrayT, + typename AccessT = StripedAccessType > +struct BlockStriped +{ + /// Number of striped accesses + static const int kStripes = int(sizeof(ArrayT) / sizeof(AccessT)); + static_assert(kStripes > 0, "AccessT type must be smaller than or equal to ArrayT type"); + + /// Load + CUTLASS_DEVICE + static void load(ArrayT &data, ArrayT *ptr, int thread_idx) + { + AccessT *access_input = reinterpret_cast(ptr); + AccessT *access_data = reinterpret_cast(&data); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kStripes; ++i) { + access_data[i] = access_input[(BlockThreads * i) + thread_idx]; + } + } + + /// Load & Add + CUTLASS_DEVICE + static void load_add(ArrayT &data, ArrayT *ptr, int thread_idx) + { + AccessT *access_input = reinterpret_cast(ptr); + AccessT *access_data = reinterpret_cast(&data); + + plus add; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kStripes; ++i) + { + access_data[i] = add(access_data[i], access_input[(BlockThreads * i) + thread_idx]); + } + } + + /// Store + CUTLASS_DEVICE + static void store(ArrayT *ptr, const ArrayT &data, int thread_idx) + { + AccessT *access_output = reinterpret_cast(ptr); + const AccessT *access_data = reinterpret_cast(&data); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kStripes; ++i) { + access_output[(BlockThreads * i) + thread_idx] = access_data[i]; + } + } + +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// BlockStripedReduce +///////////////////////////////////////////////////////////////////////////////////////////////// + + +/// Utility for performing block-striped access (load, store, reduce) of trivially-copyable, +/// statically-sized array types to global memory. +/// (Default specialization) +template < + int BlockThreads, + typename ArrayT, + typename ElementT = typename StripedAccessType::Element> +struct BlockStripedReduce : + BlockStriped< + BlockThreads, + ArrayT, + ElementT> +{ + /// Reduce + CUTLASS_DEVICE + static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx) + { + cutlass::atomic_add reduce; + ElementT *access_output = reinterpret_cast(ptr); + const ElementT *access_data = reinterpret_cast(&data); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < BlockStripedReduce::kStripes; ++i) { + reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]); + } + } +}; + + +/// Utility for performing block-striped access (load, store, reduce) of trivially-copyable, +/// statically-sized array types to global memory. +/// (Specialization for half_t. Uses half2 vectorized-reduction.) 
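// Usage sketch (illustrative only, not part of the patch above): one thread block spills each
// thread's accumulator fragment to a global workspace and another block folds it back into
// registers. kBlockThreads, AccumulatorArray, `workspace` and the producer/consumer flag are
// assumptions made for this sketch; the workspace must hold one ArrayT per participating
// thread, striped at StripedAccessType granularity.
#include "cutlass/array.h"
#include "cutlass/block_striped.h"

constexpr int kBlockThreads = 128;
using AccumulatorArray = cutlass::Array<float, 8>;   // one fragment per thread

CUTLASS_DEVICE
void share_partials(AccumulatorArray& frag, AccumulatorArray* workspace, int thread_idx, bool is_producer) {
  using Striped = cutlass::BlockStriped<kBlockThreads, AccumulatorArray>;
  if (is_producer) {
    // Producer block: coalesced, block-striped store of each thread's fragment.
    Striped::store(workspace, frag, thread_idx);
  }
  else {
    // Consumer block: read the stored fragments back, accumulating into registers.
    // (Some inter-block synchronization must order the producer and consumer.)
    Striped::load_add(frag, workspace, thread_idx);
  }
}

// Alternatively, BlockStripedReduce<kBlockThreads, AccumulatorArray>::reduce(workspace, frag, thread_idx)
// accumulates into the workspace atomically, avoiding the explicit producer/consumer ordering.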
+template < + int BlockThreads, + typename ArrayT> +struct BlockStripedReduce : + BlockStriped< + BlockThreads, + ArrayT, + half2> +{ + static_assert(BlockStripedReduce::kStripes % 2 == 0, "Array of half must be even number in length"); + + /// Reduce + CUTLASS_DEVICE + static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx) + { + cutlass::atomic_add reduce; + half2 *access_output = reinterpret_cast(ptr); + const half2 *access_data = reinterpret_cast(&data); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < BlockStripedReduce::kStripes; ++i) + { + reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]); + } + } +}; + + +} // namespace cutlass + diff --git a/include/cutlass/cluster_launch.hpp b/include/cutlass/cluster_launch.hpp new file mode 100644 index 0000000000..a0fa22b6bb --- /dev/null +++ b/include/cutlass/cluster_launch.hpp @@ -0,0 +1,275 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief PTX for TMA Tensor Memory Access operators on memory added for SM90 +*/ + +#pragma once + +#include +#include "cutlass/cutlass.h" +#include "cutlass/trace.h" +#if defined(__CUDACC_RTC__) +#include +#else +#include +#include +#endif + +#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) +# define CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED +#endif + +namespace cutlass { + +#ifndef NDEBUG +#define Return_Status(cudaError_t_status) \ + if (cudaError_t_status != cudaSuccess) { \ + fprintf(stderr, \ + "[ ERROR: CUDA Runtime ] %s:%d: %s\n", \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(cudaError_t_status)); \ + return Status::kInvalid; \ + } else { \ + return Status::kSuccess; \ + } +#else +#define Return_Status(cudaError_t_status) \ + if (cudaError_t_status != cudaSuccess) { \ + return Status::kInvalid; \ + } else { \ + return Status::kSuccess; \ + } +#endif + +struct ClusterLauncher { + constexpr static int MaxClusterSize = 32; + + // Check for hardware compatibility + static inline CUTLASS_HOST + Status check_cluster_dims(dim3 grid, dim3 cluster) { + if (((cluster.x * cluster.y * cluster.z) <= MaxClusterSize) && + (grid.x % cluster.x == 0) && (grid.y % cluster.y == 0) && (grid.z % cluster.z == 0)) { + return Status::kSuccess; + } + else { + CUTLASS_TRACE_HOST("ClusterLauncher: Invalid cluster configuration -- aborting launch."); + return Status::kInvalid; + } + } + + static inline CUTLASS_HOST + Status +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) + init(void const* kernel_function) +#else + init(void const* /* kernel_function */) +#endif + { +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + if (kernel_function == nullptr) { + CUTLASS_TRACE_HOST("kernel_function is null"); + return Status::kInvalid; + } + CUTLASS_TRACE_HOST("Checking previous error state before calling cudaFuncSetAttribute"); + cudaError_t prevStatus = cudaGetLastError(); + if (prevStatus != cudaSuccess) { + fprintf(stderr, + "[ ERROR: CUDA Runtime ] %s:%d: %s\n", + __FILE__, + __LINE__, + cudaGetErrorString(prevStatus)); + return Status::kInvalid; + } + CUTLASS_TRACE_HOST("Calling cudaFuncSetAttribute"); +#endif + // This attribute was added in CUDA 11.8. + cudaError_t status = + cudaFuncSetAttribute( + kernel_function, cudaFuncAttributeNonPortableClusterSizeAllowed, 1); + Return_Status(status); +#else + return Status::kInvalid; +#endif + } + + // This is the method we expect to use going forward + static inline CUTLASS_HOST + Status launch( + dim3 const grid_dims, + dim3 const cluster_dims, + dim3 const block_dims, + size_t const smem_size, + cudaStream_t cuda_stream, + void const* kernel, + void** kernel_params, + bool launch_with_pdl = false) { +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) + if (check_cluster_dims(grid_dims, cluster_dims) != Status::kSuccess) { + CUTLASS_TRACE_HOST("ClusterLauncher: check_cluster_dims() failed. Aborting."); + return Status::kInvalid; + } + + auto init_status = init(kernel); + if (init_status != Status::kSuccess) { + CUTLASS_TRACE_HOST("ClusterLauncher: init(kernel) failed with status " << int(init_status) << ". 
Aborting."); + return Status::kInvalid; + } + + cudaLaunchConfig_t launch_config; + launch_config.gridDim = {grid_dims.x, grid_dims.y, grid_dims.z}; + launch_config.blockDim = {block_dims.x, block_dims.y, block_dims.z}; + launch_config.dynamicSmemBytes = smem_size; + launch_config.stream = cuda_stream; + + cudaLaunchAttribute launch_attribute[2]; + + launch_attribute[0].id = cudaLaunchAttributeClusterDimension; + launch_attribute[0].val.clusterDim.x = cluster_dims.x; + launch_attribute[0].val.clusterDim.y = cluster_dims.y; + launch_attribute[0].val.clusterDim.z = cluster_dims.z; + + launch_attribute[1].id = cudaLaunchAttributeProgrammaticStreamSerialization; + launch_attribute[1].val.programmaticStreamSerializationAllowed = 1; + + launch_config.numAttrs = launch_with_pdl ? 2 : 1; + + launch_config.attrs = launch_attribute; + + CUTLASS_TRACE_HOST("ClusterLauncher: Launching GPC_CLUSTER_GRID GridDims = " + "(" << grid_dims.x << ", " << grid_dims.y << ", " << grid_dims.z << "), " + "And ClusterDims = " + "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n"); + + cutlass::arch::synclog_setup(); + cudaError_t status = cudaLaunchKernelExC(&launch_config, kernel, kernel_params); + Return_Status(status); +#else + CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! Aborting cluster launch."); + return Status::kInvalid; +#endif + } + +}; + +namespace detail { + +template +void* checked_addressof(Arg&& arg) { + static_assert(! std::is_rvalue_reference_v || ! std::is_const_v, "You cannot take the address of a const rvalue reference (const T&&)."); + // We use std::addressof to ensure we get the address, + // in case the type has an overloaded operator&. + // Note that this precludes `const T&&` references. + return const_cast(reinterpret_cast(std::addressof(arg))); +} + +} // namespace detail + +//! Parameters for launch_on_cluster (see below). +struct ClusterLaunchParams { + //! Grid dimensions + dim3 grid_dims{1, 1, 1}; + + //! Block dimensions + dim3 block_dims{1, 1, 1}; + + //! Cluster dimensions + dim3 cluster_dims{1, 1, 1}; + + //! Number of bytes required for the kernel's shared memory. + int smem_size_in_bytes = 0; + + //! CUDA stream on which to launch the kernel. + cudaStream_t cuda_stream = nullptr; +}; + +/// @brief Launch the kernel on the stream using cluster launch. +/// +/// @param params Cluster launch parameters (see above). +/// @param kernel_ptr Pointer to the kernel function (see example). +/// @param args Zero or more arguments to pass to the kernel. +/// +/// @tparam Args Types of the arguments passed to the kernel. +/// Don't specify this/these template argument(s) explicitly. +/// +/// @return Status::Success on success, else an error code. +/// +/// @code +/// template +/// __global__ void kernel(A a, B b, C c); +/// +/// X x = get_x(); +/// Y y = get_y(); +/// Z z = get_z(); +/// +/// void const* kernel_ptr = +/// const_cast(reinterpret_cast( +/// &kernel)); +/// auto status = launch_kernel_on_cluster( +/// {grid_dims, block_dims, cluster_dims, sizeof(SharedMemory)}, +/// kernel_ptr, x, y, z); +/// @endcode +template +CUTLASS_HOST cutlass::Status +launch_kernel_on_cluster(const ClusterLaunchParams& params, + void const* kernel_ptr, + Args&& ... args) +{ + // Unfortunately, we find ourselves needing to pass in + // the parameters as an array of raw pointers. 
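// Usage sketch (illustrative only, not part of the patch above): launching a kernel with a
// 2x1x1 cluster through ClusterLauncher directly, opting in to Programmatic Dependent Launch
// via the trailing flag. The kernel, its arguments and the launch shape are assumptions made
// for this sketch.
__global__ void my_kernel(float* out, int n);   // hypothetical kernel

cutlass::Status launch_with_pdl_example(float* out, int n, size_t smem_size, cudaStream_t stream) {
  dim3 grid(16, 1, 1);
  dim3 cluster(2, 1, 1);
  dim3 block(256, 1, 1);
  void* params[] = { &out, &n };                // addresses of the kernel arguments
  return cutlass::ClusterLauncher::launch(
      grid, cluster, block, smem_size, stream,
      reinterpret_cast<void const*>(&my_kernel), params,
      /*launch_with_pdl*/ true);
}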
+ if constexpr (sizeof...(Args) == 0) { + return cutlass::ClusterLauncher::launch( + params.grid_dims, + params.cluster_dims, + params.block_dims, + params.smem_size_in_bytes, + params.cuda_stream, + kernel_ptr, nullptr); + } + else { + void* kernel_params[sizeof...(Args)] = { + detail::checked_addressof(std::forward(args))... + }; + return cutlass::ClusterLauncher::launch( + params.grid_dims, + params.cluster_dims, + params.block_dims, + params.smem_size_in_bytes, + params.cuda_stream, + kernel_ptr, + kernel_params); + } +} + +} // namespace cutlass diff --git a/include/cutlass/complex.h b/include/cutlass/complex.h index aeccd2db1d..6d0bf31df6 100644 --- a/include/cutlass/complex.h +++ b/include/cutlass/complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -28,10 +28,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ + #pragma once #include +#include + #if defined(__CUDACC_RTC__) #include #else @@ -39,11 +42,11 @@ #endif #include "cutlass/cutlass.h" -#include "cutlass/half.h" +#include "cutlass/functional.h" +#include "cutlass/platform/platform.h" #include "cutlass/real.h" -#include "cutlass/bfloat16.h" -#include "cutlass/tfloat32.h" +#include "cutlass/numeric_types.h" #include "cutlass/fast_math.h" @@ -53,8 +56,7 @@ namespace cutlass { -////////////////////////////////////////////////////////////////////////////////////////////////// - +///////////////////////////////////////////////////////////////////////////////////////////////// /// Enumeraed type describing a transformation on a complex value. 
enum class ComplexTransform { kNone, @@ -116,6 +118,18 @@ double const &imag(cuDoubleComplex const &z) { return z.y; } /// Returns the imaginary part of the complex number CUTLASS_HOST_DEVICE double &imag(cuDoubleComplex &z) { return z.y; } + +// Returns the conjugate of the complex number +CUTLASS_HOST_DEVICE cuFloatComplex +conj(cuFloatComplex const& z) { + return make_cuFloatComplex(z.x, -z.y); +} + +// Returns the conjugate of the complex number +CUTLASS_HOST_DEVICE cuDoubleComplex +conj(cuDoubleComplex const& z) { + return make_cuDoubleComplex(z.x, -z.y); +} #endif /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -128,6 +142,7 @@ class complex { public: /// Type alias for scalar type + using value_type = T; private: // @@ -146,15 +161,18 @@ class complex // Methods // -/// Constructor + /// Default constructor + complex() = default; + + /// Constructor CUTLASS_HOST_DEVICE - complex(T r = T(0)) : _real(r), _imag(T(0)) {} + complex(T r) : _real(r), _imag(T(0)) {} -/// Constructor + /// Constructor CUTLASS_HOST_DEVICE complex(T r, T i) : _real(r), _imag(i) {} - // -/// Constructor + + /// Constructor template CUTLASS_HOST_DEVICE complex(complex const &z) : _real(static_cast(z.real())), _imag(static_cast(z.imag())) {} @@ -170,16 +188,6 @@ class complex complex(cuDoubleComplex const &z) : _real(static_cast(cuCreal(z))), _imag(static_cast(cuCimag(z))) {} #endif - /// Assignment - template - CUTLASS_HOST_DEVICE - complex& operator=(complex const &z) - { - _real = static_cast(z.real()); - _imag = static_cast(z.imag()); - return *this; - } - /// Equality operator CUTLASS_HOST_DEVICE bool operator==(complex const &rhs) const { return this->real() == rhs.real() && this->imag() == rhs.imag(); @@ -196,6 +204,24 @@ class complex return complex(this->real() + rhs.real(), this->imag() + rhs.imag()); } + /// Reduction into memory address. Components may update out of order. + template + CUTLASS_DEVICE void red(complex *ptr) const { + static_assert(platform::is_same::value, "Component type must match"); + cutlass::atomic_add reduce; + reduce(&ptr->_real, _real); + reduce(&ptr->_imag, _imag); + } + + /// Reduction into memory address. Components may update out of order. (Half specialization) + CUTLASS_DEVICE void red(complex *ptr) const { + static_assert(platform::is_same::value, "Component type must match"); + half2 *h2_ptr = reinterpret_cast(ptr); + half2 h2_data = reinterpret_cast(*this); + cutlass::atomic_add reduce; + reduce(h2_ptr, h2_data); + } + /// Subtraction template CUTLASS_HOST_DEVICE complex operator-(complex const &rhs) const { @@ -283,6 +309,13 @@ class complex CUTLASS_HOST_DEVICE T &imag() { return _imag; } + /// Set the real part of the complex number + CUTLASS_HOST_DEVICE + void real(T real) { _real = real; } + + /// Set the imaginary part of the complex number + CUTLASS_HOST_DEVICE + void imag(T imag) { _imag = imag; } #if !defined(__CUDACC_RTC__) /// Converts to cuFloatComplex @@ -295,60 +328,90 @@ class complex #endif }; +// Complex conjugate +template +CUTLASS_HOST_DEVICE complex conj(complex const& z) { + return {z.real(), -z.imag()}; +} + /////////////////////////////////////////////////////////////////////////////////////////////////// // // Accessors for complex template // -/// Returns the real part of the complex number -template -CUTLASS_HOST_DEVICE T const &real(complex const &z) { - return z.real(); -} +// Nonmember real and imag need to work for non-complex numbers too. 
+// That means cutlass::complex, std::complex, cuda::std::complex, and +// any user-defined complex number type that looks like std::complex. +// It's reasonable to assume that a "complex number type" has +// zero-argument real() and imag() member functions returning +// non-void. While cuFloatComplex and cuDoubleComplex lack those +// member functions, one-argument nonmember real and imag overloads +// for those types are defined above. -/// Returns the real part of the complex number -template -CUTLASS_HOST_DEVICE T &real(complex &z) { - return z.real(); -} +namespace detail { -/// Returns the imaginary part of the complex number -template -CUTLASS_HOST_DEVICE T const &imag(complex const &z) { - return z.imag(); -} +template +struct has_zero_argument_real_member_function : + cutlass::platform::false_type +{}; -/// Returns the imaginary part of the complex number template -CUTLASS_HOST_DEVICE T &imag(complex &z) { - return z.imag(); -} +struct has_zero_argument_real_member_function().real()) + > + > +> : cutlass::platform::true_type +{}; -/// Returns the real part of the real number template -CUTLASS_HOST_DEVICE T const &real(T const &r) { - return r; -} +constexpr bool has_zero_argument_real_member_function_v = + has_zero_argument_real_member_function::value; -/// Returns the real part of the real number -template -CUTLASS_HOST_DEVICE T &real(T &r) { - return r; -} +template +struct has_zero_argument_imag_member_function : + cutlass::platform::false_type +{}; -/// Returns the imaginary part of the real number template -CUTLASS_HOST_DEVICE T const &imag(T const &r) { - return T(); -} +struct has_zero_argument_imag_member_function().imag()) + > + > +> : cutlass::platform::true_type +{}; -/// Returns the imaginary part of the complex number template -CUTLASS_HOST_DEVICE T &imag(T &r) { - return T(); -} +constexpr bool has_zero_argument_imag_member_function_v = + has_zero_argument_imag_member_function::value; + +} // namespace detail +template +CUTLASS_HOST_DEVICE auto real(T z) { + if constexpr (detail::has_zero_argument_real_member_function_v) { + return z.real(); + } else { + return z; + } +} + +template +CUTLASS_HOST_DEVICE auto imag(T z) { + if constexpr (detail::has_zero_argument_imag_member_function_v) { + return z.imag(); + } else { + // Imaginary part of a non-complex input has the same type as the + // input, and its value is zero. CUTLASS assumes in this case + // that value-initializing T is well-formed and results in zero. + return T{}; + } +} + // // Output operators // @@ -375,10 +438,36 @@ std::ostream &operator<<(std::ostream &out, complex const &z) { // Non-member functions defined for complex numbers // -/// Returns the magnitude of the complex number +// abs returns the magnitude of the complex number. + +CUTLASS_HOST_DEVICE float abs(complex const &z) { + return ::hypot(z.real(), z.imag()); +} + +CUTLASS_HOST_DEVICE double abs(complex const &z) { + return ::hypot(z.real(), z.imag()); +} + +// In theory, it would make sense to add a complex +// specialization of abs here, since hypot works for long double too. +// In practice, long double doesn't have a portable number of bits or +// behavior, so users who care about higher-precision floating-point +// computation should probably insist on an actual FP128 type. + template CUTLASS_HOST_DEVICE T abs(complex const &z) { - return sqrt(norm(z)); + // cutlass::complex permits all kinds of T, including types that + // don't have NaN. 
For a generic floating-point type with Inf + // and/or NaN, LAPACK's DLAPY2 algorithm would make sense, as it + // would handle issues like avoiding unwarranted overflow if + // z.real() or z.imag() is slightly bigger than the square root of + // the max finite number. That could be a future improvement; for + // now, the code just uses the naive algorithm. + // + // Use the "swap two-step" idiom so that argument-dependent lookup + // can find any CUTLASS-specific overloads. + using cutlass::sqrt; + return sqrt(z.real() * z.real() + z.imag() * z.imag()); } /// Returns the magnitude of the complex number @@ -414,33 +503,70 @@ CUTLASS_HOST_DEVICE R norm_accumulate(T const &x, R const & accumulator) { /// Norm accumulate specialized for complex types template CUTLASS_HOST_DEVICE R norm_accumulate(complex const &z, R const &accumulator) { - return accumulator + static_cast(real(z)) * static_cast(real(z)) + + return accumulator + static_cast(real(z)) * static_cast(real(z)) + static_cast(imag(z)) * static_cast(imag(z)); } -/// Returns the complex conjugate -CUTLASS_HOST_DEVICE float conj(float const &z) { - return z; +namespace detail { + +template +CUTLASS_HOST_DEVICE T conj_impl(T const& z, cutlass::platform::true_type) { + return conj(z); } -/// Returns the complex conjugate -CUTLASS_HOST_DEVICE double conj(double const &z) { +template +CUTLASS_HOST_DEVICE T conj_impl(T const& z, cutlass::platform::false_type) { return z; } -/// Returns the complex conjugate -template -CUTLASS_HOST_DEVICE complex conj(complex const &z) { - return complex(real(z), -imag(z)); +template +CUTLASS_HOST_DEVICE T conj_impl(T const& z) { + constexpr bool use_unqualified_conj = + ! cutlass::platform::is_arithmetic_v && + ! detail::has_cutlass_conj_v && + detail::has_unqualified_conj_v; + return conj_impl(z, cutlass::platform::bool_constant{}); } -/// Indentity transform for non-complex types -template -CUTLASS_HOST_DEVICE T conj(T const &z) { - static_assert( !platform::is_same::value && - !platform::is_same::value && - !platform::is_same>::value && - !platform::is_same>::value, "May not be a complex data type"); - return z; + +} // namespace detail + +// Return the complex conjugate of the input. +// +// This MUST be a function and not a function object, because it may +// be common practice for downstream types to define specifically +// cutlass::conj overloads, instead of overloads in their namespace. +// +// As a result of this being a function and not a function object, +// CUTLASS code needs to declare "using cutlass::conj;" in scope and +// then call this function unqualified, just like std::swap. +// +// If an overload already exists for cutlass::conj(T), that overload +// will be called instead of this one. Otherwise: +// +// 1. for arithmetic types, return z; +// +// 2. for types where (namespace-unqualified) conj(z) is well formed +// and cutlass::conj(z) is NOT well formed, return conj(z); and, +// +// 3. for everything else, return z. +// +// Regarding (1), the C++ Standard Library makes std::conj always +// return std::complex, even for (noncomplex) arithmetic types. +// cutlass::conj(T t) needs to return type T. This follows the +// convention of linear algebra software like the BLAS, where +// "conjugate transpose" means the same thing as "transpose" for a +// matrix of noncomplex numbers. 
+// +// Case (2) covers std::complex, cuda::std::complex, and non-Standard +// (including user-defined) complex number types (for which "conj(z)" +// is findable via argument-dependent lookup, but does not live in the +// cutlass namespace). It excludes cutlass::conj(z) in order to +// prevent infinite recursion. +// +// Case (3) covers non-Standard non-complex number types. +template +CUTLASS_HOST_DEVICE T conj(T const& z) { + return detail::conj_impl(z); } /// Projects the complex number z onto the Riemann sphere @@ -494,24 +620,24 @@ CUTLASS_HOST_DEVICE complex sin(complex const &z) { return (exp(-z) - exp(z)) * complex(T(0), T(1) / T(2)); } -/// Comparison +/// Comparison template CUTLASS_HOST_DEVICE bool operator<(complex const &lhs, complex const &rhs) { - //TODO - return true; + return true; } ////////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for complex-valued type. template -struct RealType< complex > { +struct RealType< complex > +{ using Type = T; /// Number of elements static int const kExtent = 2; -CUTLASS_HOST_DEVICE + CUTLASS_HOST_DEVICE static complex from_real(double x) { return complex(static_cast(x)); } @@ -549,6 +675,147 @@ struct is_complex> { static bool const value = true; }; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// functional.h numeric specializations +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Squares with optional conversion +template +struct magnitude_squared, Output> { + CUTLASS_HOST_DEVICE + Output operator()(complex lhs) const { + multiplies mul_op; + + Output y_r = Output(lhs.real()); + Output y_i = Output(lhs.imag()); + + return mul_op(y_r, y_r) + mul_op(y_i, y_i); + } +}; + +/// Fused multiply-add +template +struct multiply_add, complex, complex> { + CUTLASS_HOST_DEVICE + complex operator()( + complex const &a, + complex const &b, + complex const &c) const { + + T real = c.real(); + T imag = c.imag(); + + real += a.real() * b.real(); + real += -a.imag() * b.imag(); + imag += a.real() * b.imag(); + imag += a.imag () * b.real(); + + return complex{ + real, + imag + }; + } +}; + +/// Fused multiply-add +template +struct multiply_add, T, complex> { + CUTLASS_HOST_DEVICE + complex operator()( + complex const &a, + T const &b, + complex const &c) const { + + T real = c.real(); + T imag = c.imag(); + + real += a.real() * b; + imag += a.imag () * b; + + return complex{ + real, + imag + }; + } +}; + +/// Fused multiply-add +template +struct multiply_add, complex> { + CUTLASS_HOST_DEVICE + complex operator()( + T const &a, + complex const &b, + complex const &c) const { + + T real = c.real(); + T imag = c.imag(); + + real += a * b.real(); + imag += a * b.imag(); + + return complex{ + real, + imag + }; + } +}; + +/// Conjugate +template +struct conjugate> { + CUTLASS_HOST_DEVICE + complex operator()(complex const &a) const { + // Invoke the complex overload specifically, rather than + // wasting the compiler's effort on overload resolution. + return cutlass::conj(a); + } +}; + +#if ! 
defined(__CUDACC_RTC__) +template <> +struct conjugate { + CUTLASS_HOST_DEVICE + cuFloatComplex operator()(cuFloatComplex const& z) const { + return make_cuFloatComplex(z.x, -z.y); + } +}; + +template <> +struct conjugate { + CUTLASS_HOST_DEVICE + cuDoubleComplex operator()(cuDoubleComplex const& z) const { + return make_cuDoubleComplex(z.x, -z.y); + } +}; +#endif + +/// Computes the square of a difference with optional conversion +template +struct magnitude_squared_difference, Output> { + CUTLASS_HOST_DEVICE + Output operator()(complex lhs, complex rhs) const { + multiplies mul_op; + + Output y_r = Output(lhs.real()) - Output(rhs.real()); + Output y_i = Output(lhs.imag()) - Output(rhs.imag()); + + return mul_op(y_r, y_r) + mul_op(y_i, y_i); + } +}; + +/// Reduces value into the data pointed to by ptr (complex specialization) +template +struct atomic_add> { + CUTLASS_DEVICE + void operator()(complex *ptr, const complex &data) + { + data.red(ptr); + } +}; + + ////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass diff --git a/include/cutlass/constants.h b/include/cutlass/constants.h index abb7cab438..49d96045aa 100644 --- a/include/cutlass/constants.h +++ b/include/cutlass/constants.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/conv/collective/builders/sm90_common.inl b/include/cutlass/conv/collective/builders/sm90_common.inl new file mode 100644 index 0000000000..526db83edf --- /dev/null +++ b/include/cutlass/conv/collective/builders/sm90_common.inl @@ -0,0 +1,96 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/layout/tensor.h" +#include "cutlass/arch/mma.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/dispatch_policy.hpp" +#include "cutlass/detail/layout.hpp" +#include "cutlass/gemm/collective/builders/sm90_common.inl" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::collective::detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Maps a rank-1 cute::Shape<> representing the cluster shape on to the IM2COL TMA atom that should be used with it +template +constexpr auto +sm90_cluster_shape_to_im2col_tma_atom(UnimodalClusterShape unimodal_cluster_shape) { + static_assert(cute::rank(unimodal_cluster_shape) == 1, + "Use this function to figure out TMA for each mode individually."); + + if constexpr (cute::size(unimodal_cluster_shape) == 1) { + return cute::SM90_TMA_LOAD_IM2COL{}; + } + else { + return cute::SM90_TMA_LOAD_IM2COL_MULTICAST{}; + } +} + +// Collective tile traits struct that serves as a type list containing a tensor's mem layouts and atoms for the +template< + class GmemTiledCopy_, + class SmemLayout_, + class SmemCopyAtom_ = void +> +struct Sm90ImplicitGemmTileTraits { + using GmemTiledCopy = GmemTiledCopy_; + using SmemLayout = SmemLayout_; + using SmemCopyAtom = SmemCopyAtom_; +}; + +// Accepts a cutlass::layout::Tensor tag and computes the corresponding spatial dimension count +template +constexpr int +gmem_layout_tags_to_spatial_dims() { + static_assert(cute::is_same_v); + if constexpr (cute::is_same_v) { + return 1; + } + else if constexpr (cute::is_same_v) { + return 2; + } + else if constexpr (cute::is_same_v) { + return 3; + } + else { + static_assert(cutlass::detail::dependent_false); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::collective::detail + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl b/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl new file mode 100644 index 0000000000..a08209efb6 --- /dev/null +++ b/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl @@ -0,0 +1,257 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/conv/collective/builders/sm90_common.inl" + +// SM90 Collective Builders should be used only starting CUDA 12.0 +#if (__CUDACC_VER_MAJOR__ >= 12) +#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::collective { +using namespace cute; + +namespace detail { + +// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count. +template +constexpr int +compute_stage_count_or_override(StageCount stage_count) { + return stages; +} + +// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count. +template +constexpr int +compute_stage_count_or_override(cute::Int stage_count) { + return stages; +} + +// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count. 
+template +constexpr int +compute_stage_count_or_override(StageCountAutoCarveout stage_count) { + constexpr auto mainloop_pipeline_bytes = sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage); + constexpr auto a_bits = cute::sizeof_bits_v; + constexpr auto b_bits = cute::sizeof_bits_v; + constexpr int stage_bytes = + cutlass::bits_to_bytes(a_bits * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) + + cutlass::bits_to_bytes(b_bits * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) + + static_cast(mainloop_pipeline_bytes); + + return (CapacityBytes - carveout_bytes) / stage_bytes; +} + +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA_TMA_WS_SS_FPROP +template < + conv::Operator ConvOp, + class ElementA, + class GmemLayoutA, + int AlignmentA, + class ElementB, + class GmemLayoutB, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + class KernelScheduleType +> +struct CollectiveBuilder< + arch::Sm90, + arch::OpClassTensorOp, + ConvOp, + ElementA, + GmemLayoutA, + AlignmentA, + ElementB, + GmemLayoutB, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + ClusterShape_MNK, + StageCountType, + KernelScheduleType, + cute::enable_if_t || + cute::is_same_v || + cute::is_same_v> +> { + static_assert(is_static::value); + static_assert(is_static::value); +#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED + static_assert(cutlass::detail::dependent_false, "Unsupported Toolkit for SM90 Collective Builder\n"); +#endif + static_assert(cutlass::gemm::collective::detail::is_aligned(), + "Should meet TMA alignment requirement\n"); + + // For fp32 types, map to tf32 MMA value type + using ElementAMma = cute::conditional_t, tfloat32_t, ElementA>; + using ElementBMma = cute::conditional_t, tfloat32_t, ElementB>; + + // For fprop, majorA = K, major B = K; + // For wgrad, majorA = MN, major B = MN; + // For dgrad, majorA = K, major B = MN; + static constexpr cute::GMMA::Major GmmaMajorA = + (ConvOp == conv::Operator::kWgrad) ? cute::GMMA::Major::MN : cute::GMMA::Major::K; + static constexpr cute::GMMA::Major GmmaMajorB = + (ConvOp == conv::Operator::kFprop) ? cute::GMMA::Major::K : cute::GMMA::Major::MN; + + using AtomLayoutMNK = cute::conditional_t, + Layout>, Layout>>; + + using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< + ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{})); + + // For wgrad kernel, tensor A uses tma tiled mode and tensor B uses tma im2col mode. 
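As an aside, the carveout-based stage computation above reduces to simple integer arithmetic: one stage costs a smem tile of A, a smem tile of B, and the pipeline barrier storage, and the builder fits as many stages as remain after the epilogue carveout. A minimal standalone sketch of that arithmetic follows; `stages_that_fit`, the 228 KB capacity, the 16-byte per-stage barrier cost, and the fp16 128x128x64 tile are illustrative assumptions, not constants taken from this header.

```cpp
#include <cstdio>

// Mirrors the carveout-based stage computation with plain integers.
// All concrete numbers used below are illustrative assumptions.
constexpr int stages_that_fit(int capacity_bytes, int carveout_bytes,
                              int tile_m, int tile_n, int tile_k,
                              int bytes_a, int bytes_b, int pipeline_bytes_per_stage) {
  int stage_bytes = tile_m * tile_k * bytes_a     // one smem tile of A
                  + tile_n * tile_k * bytes_b     // one smem tile of B
                  + pipeline_bytes_per_stage;     // barrier storage for the stage
  return (capacity_bytes - carveout_bytes) / stage_bytes;
}

int main() {
  // e.g. fp16 A/B, a 128x128x64 CTA tile, 32 KB reserved for the epilogue
  std::printf("%d stages\n",
              stages_that_fit(228 * 1024, 32 * 1024, 128, 128, 64, 2, 2, 16));
  return 0;
}
```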
+ using GmemTiledCopyA = cute::conditional_t(ClusterShape_MNK{}))), + decltype(cutlass::conv::collective::detail::sm90_cluster_shape_to_im2col_tma_atom(cute::shape<1>(ClusterShape_MNK{})))>; + using GmemTiledCopyB = cute::conditional_t(ClusterShape_MNK{}))), + decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(cute::shape<0>(ClusterShape_MNK{})))>; + + using SmemLayoutAtomA = decltype(cutlass::gemm::collective::detail::ss_smem_selector< + GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutAtomB = decltype(cutlass::gemm::collective::detail::ss_smem_selector< + GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + + static constexpr int PipelineStages = detail::compute_stage_count_or_override(StageCountType{}); + + using SmemLayoutA = decltype(tile_to_shape( + SmemLayoutAtomA{}, + make_shape(shape<0>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}), + Step<_2,_1,_3>{})); + using SmemLayoutB = decltype(tile_to_shape( + SmemLayoutAtomB{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}), + Step<_2,_1,_3>{})); + + constexpr static int NumSpatialDimensions = cutlass::conv::collective::detail::gmem_layout_tags_to_spatial_dims(); + + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedImplicitGemm< + ConvOp, PipelineStages, NumSpatialDimensions, ClusterShape_MNK, KernelScheduleType>; + + using CollectiveOp = CollectiveConv< + DispatchPolicy, + TileShape_MNK, + ElementA, + ElementB, + TiledMma, + detail::Sm90ImplicitGemmTileTraits, + detail::Sm90ImplicitGemmTileTraits + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA auto kernel schedule +template < + conv::Operator ConvOp, + class ElementA, + class GmemLayoutA, + int AlignmentA, + class ElementB, + class GmemLayoutB, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + class KernelScheduleType +> +struct CollectiveBuilder< + arch::Sm90, + arch::OpClassTensorOp, + ConvOp, + ElementA, + GmemLayoutA, + AlignmentA, + ElementB, + GmemLayoutB, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + ClusterShape_MNK, + StageCountType, + KernelScheduleType, + cute::enable_if_t> +> { + static_assert(is_static::value); + static_assert(is_static::value); +#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED + static_assert(cutlass::detail::dependent_false, "Unsupported Toolkit for SM90 Collective Builder\n"); +#endif + +/* +#if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 1))) + // Cooperative schedule performs best for CUDA Toolkits with version >= 12.1 + + // For TileShape_M == 64, choosing KernelTmaWarpSpecialized as the KernelSchedule + // Since KernelTmaWarpSpecializedCooperative requires TileShape_M to be at least 128 + using KernelWarpSpecializedSchedule = cute::conditional_t(TileShape_MNK{}) == Int<64>{}, + KernelImplicitTmaWarpSpecializedSm90PingPong, KernelImplicitTmaWarpSpecializedSm90Cooperative>; +#else + using KernelWarpSpecializedSchedule = KernelImplicitTmaWarpSpecializedSm90; +#endif +*/ + using KernelWarpSpecializedSchedule = KernelImplicitTmaWarpSpecializedSm90; + + using CollectiveOp = typename CollectiveBuilder< + arch::Sm90, + arch::OpClassTensorOp, + ConvOp, + ElementA, + GmemLayoutA, + AlignmentA, + ElementB, + GmemLayoutB, + AlignmentB, + ElementAccumulator, + 
TileShape_MNK, + ClusterShape_MNK, + StageCountType, + KernelWarpSpecializedSchedule + >::CollectiveOp; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/collective/collective_builder.hpp b/include/cutlass/conv/collective/collective_builder.hpp new file mode 100644 index 0000000000..9d6a16c0db --- /dev/null +++ b/include/cutlass/conv/collective/collective_builder.hpp @@ -0,0 +1,93 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/detail/dependent_false.hpp" +#include "cutlass/conv/collective/collective_conv.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Used to specify stage counts or dispatch to automatic computation of stage count +template +struct StageCount { + static constexpr int value = num_stages; + + StageCount() = default; + explicit StageCount(cute::Int) {} +}; + +template +struct StageCountAutoCarveout { + static constexpr int bytes = carveout_bytes; + + StageCountAutoCarveout() = default; + explicit StageCountAutoCarveout(cute::Int) {} +}; + +// Used to automatically let the builder pick the kernel schedule. 
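For orientation, the specializations above let a kernel author request a conv mainloop the same way the existing GEMM collective builder is used: supply the operator, operand types, layout tags, alignments, tile/cluster shapes, a stage-count policy, and a schedule, then read back `CollectiveOp`. The sketch below shows one such instantiation; the element types, NHWC layout tags, tile and cluster shapes, and the 32 KB carveout are illustrative choices rather than defaults defined by this header.

```cpp
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/half.h"
#include "cutlass/layout/tensor.h"

// Hypothetical fprop mainloop built through the new conv CollectiveBuilder.
using CollectiveMainloop = cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    cutlass::half_t, cutlass::layout::TensorNHWC, 8,   // A (activation), 128-bit aligned
    cutlass::half_t, cutlass::layout::TensorNHWC, 8,   // B (filter), 128-bit aligned
    float,                                             // accumulator
    cute::Shape<cute::_128, cute::_128, cute::_64>,    // CTA tile (M, N, K)
    cute::Shape<cute::_1, cute::_1, cute::_1>,         // cluster shape
    cutlass::conv::collective::StageCountAutoCarveout<32 * 1024>,  // leave smem for the epilogue
    cutlass::conv::collective::KernelScheduleAuto      // defer schedule choice to the builder
  >::CollectiveOp;
```

Passing `KernelScheduleAuto` routes through the auto-schedule specialization above, which currently resolves to `KernelImplicitTmaWarpSpecializedSm90`.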
+// Can be overridden with kernel schedule tags in cutlass/conv/dispatch_policy.hpp +struct KernelScheduleAuto {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class ArchTag, + class OpClass, + conv::Operator, + class ElementA, + class GmemLayoutA, + int AlignmentA, + class ElementB, + class GmemLayoutB, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + class KernelScheduleType, + class Enable = void +> +struct CollectiveBuilder { + static_assert(cutlass::detail::dependent_false, "Could not build a collective for given parameters."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "builders/sm90_gmma_builder.inl" +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/collective/collective_conv.hpp b/include/cutlass/conv/collective/collective_conv.hpp new file mode 100644 index 0000000000..d187b5ecee --- /dev/null +++ b/include/cutlass/conv/collective/collective_conv.hpp @@ -0,0 +1,62 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/detail/dependent_false.hpp" +#include "cutlass/conv/collective/detail.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class DispatchPolicy, + class TileShape, + class ElementA, + class ElementB, + class TiledMma, + class TileTraitsA, + class TileTraitsB +> +struct CollectiveConv { + static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "sm90_implicit_gemm_gmma_ss_warpspecialized.hpp" +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/collective/detail.hpp b/include/cutlass/conv/collective/detail.hpp new file mode 100644 index 0000000000..ac272c8e20 --- /dev/null +++ b/include/cutlass/conv/collective/detail.hpp @@ -0,0 +1,254 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/conv/convnd_problem_shape.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::collective::detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Construct the stride types for conv collectives based on the dispatch policy, strides 64b by default +template +constexpr auto +sm90_dispatch_policy_to_stride_A() { + if constexpr (DispatchPolicy::ConvOp == conv::Operator::kFprop) { + // Maps to modes ((w,n), C) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1) { + return cute::Stride, + cute::Int<1>>{}; + } + // Maps to modes ((w,h,n), C) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) { + return cute::Stride, + cute::Int<1>>{}; + } + // Maps to modes ((w,h,d,n), C) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, + cute::Int<1>>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kWgrad) { + // Maps to modes (k, nq/npq/nzpq) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1 || + DispatchPolicy::NumSpatialDimensions == 2 || + DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, int64_t>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kDgrad) { + // Maps to modes ((q,n), K) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1) { + return cute::Stride, + cute::Int<1>>{}; + } + // Maps to modes ((q,p,n), K) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) { + return cute::Stride, + cute::Int<1>>{}; + } + // Maps to modes ((q,p,z,n), K) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, + cute::Int<1>>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Unsupported ConvOp."); + } +} + +// Construct the stirde types for conv collectives based on the dispatch policy, strides 64b by default +template +constexpr auto +sm90_dispatch_policy_to_stride_B() { + if constexpr (DispatchPolicy::ConvOp == conv::Operator::kFprop) { + // Maps to modes (k, (C,s)) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1) { + return cute::Stride, int64_t>>{}; + } + // Maps to modes (k, (C,s,r)) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) { + return cute::Stride, int64_t, int64_t>>{}; + } + // Maps to modes (k, (C,s,r,t)) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, int64_t, int64_t, int64_t>>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kWgrad) { + // Maps to modes (C, (w,n)) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1) { + return cute::Stride, + cute::Stride>{}; + } + // Maps to modes (C, (w,h,n)) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) { + return cute::Stride, + cute::Stride>{}; + } + // Maps to modes (C, (w,h,d,n)) + else if 
constexpr (DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, + cute::Stride>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kDgrad) { + // Maps to modes (C, (k,s)) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1) { + return cute::Stride, cute::Stride>{}; + } + // Maps to modes (C, (k,s,r)) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) { + return cute::Stride, cute::Stride>{}; + } + // Maps to modes (C, (k,s,r,t)) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, cute::Stride>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Unsupported ConvOp."); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Compute the lower/near corner, returning it as a cute::array in [W,H,D] order +template +CUTLASS_HOST_DEVICE +constexpr auto +compute_lower_corner_whd(ConvProblemShape const& problem_shape) { + using cute::for_each; + using cute::make_seq; + + cute::array lower{}; + if constexpr (ConvOp == conv::Operator::kFprop || + ConvOp == conv::Operator::kWgrad) { + for_each(make_seq{}, [&](auto i) { + lower[NumSpatialDimensions-1-i] = -1 * problem_shape.lower_padding[i]; + }); + } + else if constexpr (ConvOp == conv::Operator::kDgrad) { + for_each(make_seq{}, [&](auto i) { + lower[NumSpatialDimensions-1-i] = problem_shape.lower_padding[i] - + (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i]; + }); + } + return lower; +} + +// Computes the upper/far corner, returning it as a cute::array in [W,H,D] order +template +CUTLASS_HOST_DEVICE +constexpr auto +compute_upper_corner_whd(ConvProblemShape const& problem_shape) { + using cute::for_each; + using cute::make_seq; + + cute::array upper{}; + if constexpr (ConvOp == conv::Operator::kFprop) { + for_each(make_seq{}, [&](auto i) { + upper[NumSpatialDimensions-1-i] = problem_shape.upper_padding[i] - + (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i]; + }); + } + else if constexpr (ConvOp == conv::Operator::kWgrad) { + for_each(make_seq{}, [&](auto i) { + upper[NumSpatialDimensions-1-i] = problem_shape.upper_padding[i] - + (problem_shape.shape_C[i+1] - 1) * problem_shape.dilation[i]; + }); + } + else if constexpr (ConvOp == conv::Operator::kDgrad) { + for_each(make_seq{}, [&](auto i) { + upper[NumSpatialDimensions-1-i] = problem_shape.lower_padding[i] - + (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i] + problem_shape.shape_C[i+1] - problem_shape.shape_A[i+1]; + }); + } + return upper; +} + +// Compute the lower/near corner of (t,r,s), returning it as a cute::array in [S,R,T] order +template +CUTLASS_HOST_DEVICE +constexpr auto +compute_lower_srt(ConvProblemShape const& problem_shape) { + using cute::for_each; + using cute::make_seq; + + cute::array lower{}; + if constexpr (ConvOp == conv::Operator::kFprop || + ConvOp == conv::Operator::kWgrad) { + for_each(make_seq{}, [&](auto i) { + lower[NumSpatialDimensions-1-i] = 0; + }); + } + else if constexpr (ConvOp == conv::Operator::kDgrad) { + for_each(make_seq{}, [&](auto i) { + lower[NumSpatialDimensions-1-i] = (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i]; + }); + } + return lower; +} + +template struct is_im2col_load { static constexpr 
bool value = false; }; +template <> struct is_im2col_load { static constexpr bool value = true; }; +template <> struct is_im2col_load { static constexpr bool value = true; }; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::collective::detail diff --git a/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp b/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp new file mode 100644 index 0000000000..0e5d898d0e --- /dev/null +++ b/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp @@ -0,0 +1,753 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor_predicate.hpp" +#include "cute/arch/cluster_sm90.hpp" +#include "cute/arch/copy_sm90.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/atom/copy_traits_sm90_im2col.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" +#include "cute/algorithm/functional.hpp" +#include "cute/algorithm/gemm.hpp" + +#include "cutlass/conv/detail.hpp" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/dispatch_policy.hpp" +#include "cutlass/pipeline/pipeline.hpp" +#include "cutlass/util/packed_stride.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::collective { +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + conv::Operator ConvOp, + int Stages, + int NumSpatialDims, + class ClusterShape, + class KernelSchedule, + int PipelineAsyncMmaStages, + class TileShape_, + class ElementA_, + class ElementB_, + class TiledMma_, + class TileTraitsA_, + class TileTraitsB_> +struct CollectiveConv< + MainloopSm90TmaGmmaWarpSpecializedImplicitGemm< + ConvOp, Stages, NumSpatialDims, ClusterShape, KernelSchedule, PipelineAsyncMmaStages>, + TileShape_, + ElementA_, + ElementB_, + TiledMma_, + TileTraitsA_, + TileTraitsB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedImplicitGemm< + ConvOp, Stages, NumSpatialDims, ClusterShape, KernelSchedule, PipelineAsyncMmaStages>; + using TileShape = TileShape_; + using ElementA = ElementA_; + using ElementB = ElementB_; + using TiledMma = TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using GmemTiledCopyA = typename TileTraitsA_::GmemTiledCopy; + using GmemTiledCopyB = typename TileTraitsB_::GmemTiledCopy; + using SmemLayoutA = typename TileTraitsA_::SmemLayout; + using SmemLayoutB = typename TileTraitsB_::SmemLayout; + using ArchTag = typename DispatchPolicy::ArchTag; + static constexpr int NumSpatialDimensions = DispatchPolicy::NumSpatialDimensions; + static constexpr int NumTensorDimensions = NumSpatialDimensions + 2; + // Deduce the kernel-facing stride tuple types based on the dispatch policy + // (which is a function of the number of spatial dimensions, the algorithm, etc.) 
+ using StrideA = decltype(detail::sm90_dispatch_policy_to_stride_A()); + using StrideB = decltype(detail::sm90_dispatch_policy_to_stride_B()); + + using MainloopPipeline = cutlass::PipelineTmaAsync; + + using PipelineParams = typename MainloopPipeline::Params; + using PipelineState = typename cutlass::PipelineState; + + using ProblemShape = ConvProblemShape; + + // TODO: move pipeline mode tiling into the collective setup phase instead + static_assert(rank(SmemLayoutA{}) == 3, "SmemLayout must be rank 3 (M/N, K, PIPE)"); + static_assert((size<0>(TileShape{}) == size<0>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape."); + static_assert((size<2>(TileShape{}) == size<1>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape."); + + static_assert(rank(SmemLayoutB{}) == 3, "SmemLayout must be rank 3 (M/N, K, PIPE)"); + static_assert((size<1>(TileShape{}) == size<0>(SmemLayoutB{})), "SmemLayout must be compatible with the tile shape."); + static_assert((size<2>(TileShape{}) == size<1>(SmemLayoutB{})), "SmemLayout must be compatible with the tile shape."); + + static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); + static_assert(cute::is_base_of::value && + cute::is_base_of::value, + "MMA atom must source both A and B operand from smem_desc for this mainloop."); + + // The tma load mode of wgrad is tiled for tensor A and im2col for tensor B while the tma load mode of fprop and dgrad + // kernel is im2col for tensor A and tiled for tensor B. + static_assert((ConvOp == conv::Operator::kWgrad + && (cute::is_same_v || cute::is_same_v)) + || (ConvOp != conv::Operator::kWgrad + && (cute::is_same_v || cute::is_same_v)), + "GmemTiledCopyA - invalid SM90 TMA copy atom specified."); + static_assert((ConvOp == conv::Operator::kWgrad + && (cute::is_same_v || cute::is_same_v)) + || (ConvOp != conv::Operator::kWgrad + && (cute::is_same_v || cute::is_same_v)), + "GmemTiledCopyB - invalid SM90 TMA copy atom specified."); + + static constexpr bool is_im2col_A = detail::is_im2col_load::value; + static constexpr bool is_im2col_B = detail::is_im2col_load::value; + + // TMA converts f32 input to tf32 when copying from GMEM to SMEM + // For all other types, cast to size equivalent uint type to avoid any rounding by TMA. 
+ static constexpr bool ConvertF32toTF32A = cute::is_same_v; + static constexpr bool ConvertF32toTF32B = cute::is_same_v; + using InternalElementA = cute::conditional_t>>; + using InternalElementB = cute::conditional_t>>; + + struct SharedStorage + { + struct TensorStorage : cute::aligned_struct<128, _0> { + cute::array_aligned> smem_A; + cute::array_aligned> smem_B; + } tensors; + + using PipelineStorage = typename MainloopPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; + static constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages; + static constexpr uint32_t TmaTransactionBytes = + (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof(InternalElementA)))+ + (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof(InternalElementB))); + + // Host side kernel arguments + struct Arguments { + ElementA const* ptr_A{nullptr}; + ElementB const* ptr_B{nullptr}; + }; + +private: + // Note that for fprop and dgrad kernel, the tma load mode is im2col for tensor A and tiled for + // tensor B while for wgrad kernel, the tma load mode is tiled for tensor A and im2col for tensor + // B since operand A, B is swapped. + // Get tma_load_a instantce. + template + static constexpr auto + get_tma_load_a_instance(TensorA const& tensor_a, ProblemShape const& problem_shape) { + if constexpr (is_im2col_A) { + // compute the upper and lower corners based on the conv padding + auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape); + auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape); + auto lower_srt = detail::compute_lower_srt(problem_shape); + + // The calculation of gbasis strides for dgrad kernel needs perform negate for dilation values. + cute::array stride_srt{}; + for (int i = 0; i < NumSpatialDimensions; ++i) { + stride_srt[i] = ConvOp == conv::Operator::kDgrad ? + -problem_shape.dilation[NumSpatialDimensions-1-i] : + problem_shape.dilation[NumSpatialDimensions-1-i]; + } + + return make_im2col_tma_copy( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,_0{}), + product_each(shape(SmemLayoutA{}(_,_,_0{}))), + size<1>(ClusterShape{}), + shape(lower_corner_whd), + shape(upper_corner_whd), + cute::reverse(shape(problem_shape.lower_padding)), + cute::reverse(shape(problem_shape.upper_padding)), + cute::reverse(shape(problem_shape.traversal_stride)), + shape(lower_srt), + shape(stride_srt)); + } + // TMA tiled mode for tensor A in wgrad kernel. + else { + return make_tma_copy( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,_0{}), + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})), + size<1>(ClusterShape{})); + } + } + + // Get tma_load_b instantce. + template + static constexpr auto + get_tma_load_b_instance(TensorB const& tensor_b, ProblemShape const& problem_shape) { + // TMA im2col mode for tensor B in wgrad kernel. 
+ if constexpr (is_im2col_B) { + // compute the upper and lower corners based on the conv padding + auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape); + auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape); + auto lower_srt = detail::compute_lower_srt(problem_shape); + + return make_im2col_tma_copy( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,_0{}), + product_each(shape(SmemLayoutB{}(_,_,_0{}))), + size<0>(ClusterShape{}), + shape(lower_corner_whd), + shape(upper_corner_whd), + cute::reverse(shape(problem_shape.lower_padding)), + cute::reverse(shape(problem_shape.upper_padding)), + cute::reverse(shape(problem_shape.traversal_stride)), + shape(lower_srt), + cute::reverse(shape(problem_shape.dilation))); + } + else { + return make_tma_copy( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,_0{}), + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})), + size<0>(ClusterShape{})); + } + } + +public: + + // Performs im2col transformations on the input of type ConvProblemShape + static constexpr auto + get_problem_shape_MNKL(ProblemShape const& problem_shape) { + + if constexpr (is_im2col_A || is_im2col_B) { + // transformation + im2col linearization + return cutlass::conv::detail::get_linearized_problem_shape_MNKL(problem_shape); + } + else { + // transformation + return cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape); + } + } + + // Device side kernel params + struct Params { + using _Submode = decltype(take<0,NumTensorDimensions-1>(typename ProblemShape::TensorExtent{})); + + // Assumption: StrideA is congruent with Problem_MK + // Select TMA load type according to convolution operator. + using TensorShapeA = cute::conditional_t; + + using TensorShapeB = cute::conditional_t; + + using TMA_A = decltype(get_tma_load_a_instance( + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + make_layout(TensorShapeA{}, StrideA{})), + ConvProblemShape{})); + + using TMA_B = decltype(get_tma_load_b_instance( + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + make_layout(TensorShapeB{}, StrideB{})), + ConvProblemShape{})); + + // Members + TMA_A tma_load_a; + TMA_B tma_load_b; + uint32_t tma_transaction_bytes = TmaTransactionBytes; + }; + + // + // Methods + // + + // Lowers the host side user facing arguments to the kernel facing lauch params + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + (void) workspace; + // from the flat problem shape arrays of ConvProblemShape, create a rank-3 MNK problem shape tuple + // tma desc creation depends on the original untransformed domain. + + // A extents. + auto shape_A_orig = problem_shape.get_shape_A(); + // B extents. 
+ auto shape_B_orig = problem_shape.get_shape_B(); + + // Fill inferred cute strides from flat stride arrays + auto dA = make_cute_packed_stride(StrideA{}, problem_shape.stride_A, ConvOp); + auto dB = make_cute_packed_stride(StrideB{}, problem_shape.stride_B, ConvOp); + + auto ptr_A = reinterpret_cast(args.ptr_A); + auto ptr_B = reinterpret_cast(args.ptr_B); + + Tensor tensor_a = make_tensor(make_gmem_ptr(ptr_A), make_layout(shape_A_orig, dA)); + Tensor tensor_b = make_tensor(make_gmem_ptr(ptr_B), make_layout(shape_B_orig, dB)); + + auto tma_load_a = get_tma_load_a_instance(tensor_a, problem_shape); + auto tma_load_b = get_tma_load_b_instance(tensor_b, problem_shape); + + return { + tma_load_a, + tma_load_b, + TmaTransactionBytes + }; + } + + template + static bool + can_implement( + ProblemShape const& problem_shape, + Arguments const& args) { + // Activation and Filter channel mode extents much match + bool implementable = true; + // channel mode is major + implementable &= problem_shape.stride_A[NumTensorDimensions-1] == 1; + implementable &= problem_shape.stride_B[NumTensorDimensions-1] == 1; + + constexpr int tma_alignment_bits = 128; + // A extents. + auto shape_A_orig = problem_shape.get_shape_A(); + // B extents. + auto shape_B_orig = problem_shape.get_shape_B(); + constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(shape_A_orig, StrideA{}); + constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(shape_B_orig, StrideB{}); + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return false; + } + + // Check valid padding values for TMA_LOAD_IM2COL + constexpr int padding_limit = (ProblemShape::RankS == 1) ? 65536 : (ProblemShape::RankS == 2 ? 
256 : 16); + for (int i = 0; i < problem_shape.RankS; ++i) { + implementable = implementable && problem_shape.lower_padding[i] <= padding_limit && problem_shape.lower_padding[i] >= 0; + implementable = implementable && problem_shape.upper_padding[i] <= padding_limit && problem_shape.upper_padding[i] >= 0; + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Padding values don't meet requirements for TMA LOAD IM2COL.\n"); + return false; + } + + if (is_im2col_A || is_im2col_B) { + // Check valid corner values for TMA_LOAD_IM2COL, signed int ranging from [-corner_limit, corner_limit - 1] + constexpr int32_t corner_limit = 1 << (16 / NumSpatialDimensions - 1); + auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape); + for (int i = 0; i < problem_shape.RankS; ++i) { + implementable = implementable && lower_corner_whd[i] >= -corner_limit && lower_corner_whd[i] <= (corner_limit - 1); + } + auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape); + for (int i = 0; i < problem_shape.RankS; ++i) { + implementable = implementable && upper_corner_whd[i] >= -corner_limit && upper_corner_whd[i] <= (corner_limit - 1); + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Padding values don't meet requirements for TMA LOAD IM2COL.\n"); + return false; + } + } + + // Wgrad kernels don't support non-packed output strides, non-packed tensor A stride (linearized) + if constexpr (ConvOp == conv::Operator::kWgrad) { +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + std::ostringstream os; +#endif + const auto & input_shape = problem_shape.shape_A; + const auto & input_stride = problem_shape.stride_A; + + implementable &= input_stride[ProblemShape::RankT - 1] == 1; + int input_shape_size = 1; + for (int i = ProblemShape::RankT - 2; i >= 0; --i) { + input_shape_size *= input_shape[i + 1]; + implementable &= input_stride[i] == input_shape_size; +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + if (input_stride[i] != input_shape_size) { + os << "\n *** input_stride[" << i << "] = " << input_stride[i] << " != input_shape_size = " << input_shape_size << " ***"; + } +#endif + } + + if (!implementable) { +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + os << "\n input_shape_size: " << input_shape_size + << "\n input_shape: " << input_shape + << "\n input_stride: " << input_stride + << "\n"; +#endif + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Wgrad kernels don't support non-packed input strides.\n"); +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST(os.str()); +#endif + return false; + } + + const auto & output_shape = problem_shape.shape_C; + const auto & output_stride = problem_shape.stride_C; + + implementable &= output_stride[ProblemShape::RankT - 1] == 1; + int output_shape_size = 1; + for (int i = ProblemShape::RankT - 2; i >= 0; --i) { + output_shape_size *= output_shape[i + 1]; + implementable &= output_stride[i] == output_shape_size; +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + if (output_stride[i] != output_shape_size) { + os << "\n *** output_stride[" << i << "] = " << output_stride[i] << " != output_shape_size = " << output_shape_size << " ***"; + } +#endif + } + + if (!implementable) { +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + os << "\n output_shape_size: " << input_shape_size + << "\n output_shape: " << input_shape + << "\n output_stride: " << input_stride + 
<< "\n"; +#endif + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Wgrad kernels don't support non-packed output strides.\n"); +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST(os.str()); +#endif + return false; + } + } + + // Conv kernels only support cross correlation mode currently. + implementable &= problem_shape.mode == cutlass::conv::Mode::kCrossCorrelation; + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Conv kernels only support cross correlation mode currently.\n"); + return false; + } + + if (problem_shape.groups > 1) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: This kernel does not support conv groups > 1.\n"); + return false; + } + + return true; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& mainloop_params) { + cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); + cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); + } + + /// Set up the data needed by this collective for load and mma. + /// Returns a tuple of tensors. The collective and the kernel layer have the contract + /// Returned tuple must contain at least two elements, with the first two elements being: + /// gA_mk - The tma tensor, A after a local tile so it has shape (BLK_M,BLK_K,m,k) + /// gB_nk - The tma tensor, B after a local tile so it has shape (BLK_N,BLK_K,n,k) + /// The rest of the tensors can be specified as needed by this collective. + /// The dimensions of gA_mk and gA_nk do not contain L to maintain consistency with + /// StrideA and StrideB set up for TMA + template + CUTLASS_DEVICE auto + load_init(ProblemShapeMNKL const& problem_shape_MNKL, Params const& mainloop_params){ + //load_init(ProblemShapeMNKL const& problem_shape_MNKL, Params const& mainloop_params) const { + using X = Underscore; + // Separate out problem shape for convenience + auto [M, N, K, L] = problem_shape_MNKL; + + // TMA requires special handling of strides to deal with coord codomain mapping + // Represent the full tensors -- get these from TMA + Tensor mA_mk = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K)); // (m,k) + Tensor mB_nk = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K)); // (n,k) + + // Make tiled views, defer the slice + Tensor gA_mk = local_tile(mA_mk, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k) + Tensor gB_nk = local_tile(mB_nk, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k) + + return cute::make_tuple(gA_mk, gB_nk); + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Producer Perspective + template < + class TensorA, class TensorB, + class KTileIterator, class BlockCoord + > + CUTLASS_DEVICE void + load( + Params const& mainloop_params, + MainloopPipeline pipeline, + PipelineState smem_pipe_producer_state, + cute::tuple const& load_inputs, + BlockCoord const& blk_coord, + KTileIterator k_tile_iter, int k_tile_count, + int thread_idx, + uint32_t block_rank_in_cluster, + TensorStorage& shared_tensors) { + + int lane_predicate = cute::elect_one_sync(); + if (lane_predicate) { + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // + // Prepare the TMA loads for A and B + // + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + + 
uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); + auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x); + + auto [gA_mk, gB_nk] = load_inputs; + + // Partition the inputs based on the current block coordinates. + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; + + Tensor gA = gA_mk(_,_,m_coord,_); // (BLK_M,BLK_K,k) + Tensor gB = gB_nk(_,_,n_coord,_); // (BLK_N,BLK_K,k) + + // Applies the mapping from block_tma_a + Tensor tAgA = block_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) + Tensor tAsA = block_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) + + Tensor tBgB = block_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) + Tensor tBsB = block_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) + + uint16_t mcast_mask_a = 0; + uint16_t mcast_mask_b = 0; + + // Issue TmaLoads + // Maps the tile -> block, value + if constexpr (cute::is_same_v || + cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); + } + } + + if constexpr (cute::is_same_v || + cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); + } + } + + // Mainloop + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) { + // LOCK smem_pipe_producer_state for _writing_ + pipeline.producer_acquire(smem_pipe_producer_state); + + // + // Copy gmem to smem for *k_tile_iter + // + + using BarrierType = typename MainloopPipeline::ProducerBarrierType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_producer_state); + + int write_stage = smem_pipe_producer_state.index(); + + copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); + copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); + ++k_tile_iter; + + // Advance smem_pipe_producer_state + ++smem_pipe_producer_state; + } + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_producer_state) { + int lane_predicate = cute::elect_one_sync(); + + // Issue the epilogue waits + if (lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all + * Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was + * still inverted from make_producer_start_state + */ + pipeline.producer_tail(smem_pipe_producer_state); + } + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Consumer Perspective + template + CUTLASS_DEVICE void + mma(MainloopPipeline pipeline, + PipelineState smem_pipe_consumer_state, + FrgTensorC& accum, + int k_tile_count, + int thread_idx, + TensorStorage& shared_tensors, + Params const& mainloop_params) { + static_assert(is_rmem::value, "C tensor must be rmem resident."); + + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // + // 
Define C accumulators and A/B partitioning + // + + TiledMma tiled_mma; + auto thread_mma = tiled_mma.get_thread_slice(thread_idx); + + Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + // Allocate "fragments/descriptors" + Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE + + // + // PIPELINED MAIN LOOP + // + static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX), + "ERROR : Incorrect number of MMAs in flight"); + + // We release buffers to producer warps(dma load) with some mmas in flight + PipelineState smem_pipe_release = smem_pipe_consumer_state; + + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + + warpgroup_fence_operand(accum); + CUTLASS_PRAGMA_UNROLL + for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) { + // WAIT on smem_pipe_consumer_state until its data are available (phase bit flips from rdPhaseBit value) + pipeline.consumer_wait(smem_pipe_consumer_state); + + int read_stage = smem_pipe_consumer_state.index(); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + + warpgroup_commit_batch(); + + ++smem_pipe_consumer_state; + } + + warpgroup_fence_operand(accum); + // Mainloop GMMAs + k_tile_count -= prologue_mma_count; + + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) { + // WAIT on smem_pipe_consumer_state until its data are available (phase bit flips from rdPhaseBit value) + pipeline.consumer_wait(smem_pipe_consumer_state); + + // + // Compute on k_tile + // + + int read_stage = smem_pipe_consumer_state.index(); + warpgroup_fence_operand(accum); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M) x (V,N) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_producer_state is consumed + warpgroup_wait(); + warpgroup_fence_operand(accum); + + // UNLOCK smem_pipe_release, done _computing_ on it + pipeline.consumer_release(smem_pipe_release); + + // Advance smem_pipe_consumer_state and smem_pipe_release + ++smem_pipe_consumer_state; + ++smem_pipe_release; + } + + warpgroup_fence_operand(accum); + } + + /// Perform a Consumer Epilogue to release all buffers + CUTLASS_DEVICE void + mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + k_tile_count -= prologue_mma_count; 
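+    // Each steady-state iteration of mma() has already released its own buffer,
+    // so only the prologue_mma_count stages issued before that loop are still
+    // locked. Advance the release state past the stages released in mma(), wait
+    // for all outstanding GMMAs, then unlock the remaining prologue stages.
+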
+ + smem_pipe_release.advance(k_tile_count); + + // Wait on all GMMAs to complete + warpgroup_wait<0>(); + + for (int count = 0; count < prologue_mma_count; ++count) { + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + ++smem_pipe_release; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/conv2d_problem_size.h b/include/cutlass/conv/conv2d_problem_size.h index 7d0c86f478..d2e8952998 100644 --- a/include/cutlass/conv/conv2d_problem_size.h +++ b/include/cutlass/conv/conv2d_problem_size.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -35,7 +35,7 @@ activation (NHWC), filter (KRSC), output (NPQK), - pading (pad_h, pad_w), + pading (pad_h, pad_w), stride (stride_h, stride_w), dilation (dilation_h, dilation_w). @@ -47,17 +47,10 @@ #pragma once - -#if defined(__CUDACC_RTC__) -#include -#else -#include -#endif - #include "cutlass/cutlass.h" #include "cutlass/tensor_coord.h" #include "cutlass/fast_math.h" -#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_enumerated_types.h" #include "cutlass/matrix_coord.h" #include "cutlass/conv/convolution.h" #include "cutlass/functional.h" @@ -87,7 +80,7 @@ struct Conv2dProblemSize { public: CUTLASS_HOST_DEVICE - Conv2dProblemSize(): + Conv2dProblemSize(): N(0), H(0), W(0), C(0), P(0), Q(0), K(0), R(0), S(0), pad_h(0), pad_w(0), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), mode(Mode::kConvolution), split_k_slices(1), groups(1) { } @@ -107,7 +100,7 @@ struct Conv2dProblemSize { Mode mode ): N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S), - pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), + pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), mode(mode), split_k_slices(1), groups (1) { } /// Constructor @@ -131,9 +124,9 @@ struct Conv2dProblemSize { Mode mode, int split_k_slices = 1, int groups = 1 - ): - N(N), H(H), W(W), C(C), K(K), R(R), S(S), P(P), Q(Q), - pad_h(pad_h), pad_w(pad_w), stride_h(stride_h), stride_w(stride_w), + ): + N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S), + pad_h(pad_h), pad_w(pad_w), stride_h(stride_h), stride_w(stride_w), dilation_h(dilation_h), dilation_w(dilation_w), mode(mode), split_k_slices(split_k_slices), groups (groups) { } @@ -152,11 +145,11 @@ struct Conv2dProblemSize { int groups = 1 ): N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + P(output_size.h()), Q(output_size.w()), K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), - pad_h(padding[0]), pad_w(padding[2]), - stride_h(stride.row()), stride_w(stride.column()), + pad_h(padding[0]), pad_w(padding[2]), + stride_h(stride.row()), stride_w(stride.column()), dilation_h(dilation.row()), dilation_w(dilation.column()), - P(output_size.h()), Q(output_size.w()), mode(mode), split_k_slices(split_k_slices), groups(groups) {} /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord @@ -165,7 
+158,7 @@ struct Conv2dProblemSize { Conv2dProblemSize( cutlass::Tensor4DCoord input_size, // NHWC cutlass::Tensor4DCoord filter_size, // KRSC - cutlass::Tensor4DCoord padding, // pad_h, _, pad_w, _ + cutlass::Tensor4DCoord padding, // pad_h, upper_pad_h, pad_w, upper_pad_w cutlass::MatrixCoord stride, // stride_h, stride_w cutlass::MatrixCoord dilation, // dilation_h, dilation_w cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, @@ -175,12 +168,12 @@ struct Conv2dProblemSize { N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), pad_h(padding[0]), pad_w(padding[2]), - stride_h(stride.row()), stride_w(stride.column()), + stride_h(stride.row()), stride_w(stride.column()), dilation_h(dilation.row()), dilation_w(dilation.column()), mode(mode), split_k_slices(split_k_slices), groups(groups) { // set output P and Q - P = ((H + pad_h * 2 - R * dilation_h) / stride_h) + 1; - Q = ((W + pad_w * 2 - S * dilation_w) / stride_w) + 1; + P = ((H + pad_h + padding[1] - R * dilation_h) / stride_h) + 1; + Q = ((W + pad_w + padding[3] - S * dilation_w) / stride_w) + 1; } /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord @@ -195,9 +188,9 @@ struct Conv2dProblemSize { int groups = 1 ): N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + P(output_size.h()), Q(output_size.w()), K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), - P(output_size.h()), Q(output_size.w()), - pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), + pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), mode(mode), split_k_slices(split_k_slices), groups(groups) {} @@ -221,12 +214,12 @@ struct Conv2dProblemSize { CUTLASS_HOST_DEVICE bool operator==(Conv2dProblemSize const &conv) const { return ( - (N == conv.N) && (W == conv.H) && (W == conv.W) && (C == conv.C) && + (N == conv.N) && (H == conv.H) && (W == conv.W) && (C == conv.C) && (K == conv.K) && (R == conv.R) && (S == conv.S) && (P == conv.P) && (Q == conv.Q) && (pad_h == conv.pad_h) && (pad_w == conv.pad_w) && (stride_h == conv.stride_h) && (stride_w == conv.stride_w) && - (dilation_h == conv.dilation_h) && (dilation_h == conv.dilation_h) + (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w) ); } @@ -245,9 +238,10 @@ struct Conv2dProblemSize { /// Returns filter extent as Tensor4DCoord CUTLASS_HOST_DEVICE - cutlass::Tensor4DCoord filter_extent() const { + cutlass::Tensor4DCoord filter_extent(bool is_deconv = false) const { - return cutlass::Tensor4DCoord ({K, R, S, C}); + return is_deconv ? 
cutlass::Tensor4DCoord ({C, R, S, K / groups}) + : cutlass::Tensor4DCoord ({K, R, S, C / groups}); } /// Returns output extent as Tensor4DCoord @@ -268,7 +262,7 @@ struct Conv2dProblemSize { CUTLASS_HOST_DEVICE int64_t filter_size() const { - return (K * R * S * C); + return (K * R * S * C / groups); } /// Returns output size in number of elements @@ -278,7 +272,7 @@ struct Conv2dProblemSize { return (N * P * Q * K); } - /// Returns output extent as Tensor4DCoord + /// Returns padding as Tensor4DCoord CUTLASS_HOST_DEVICE cutlass::Tensor4DCoord padding() const { @@ -336,8 +330,9 @@ cutlass::gemm::GemmCoord implicit_gemm_problem_size( return gemm::GemmCoord( problem_size.N * problem_size.P * problem_size.Q, problem_size.K, - problem_size.R * problem_size.S * problem_size.C + problem_size.R * problem_size.S * problem_size.C / problem_size.groups ); + case Operator::kDeconv: case Operator::kDgrad: return gemm::GemmCoord( problem_size.N * problem_size.H * problem_size.W, @@ -362,61 +357,160 @@ int implicit_gemm_k_iterations( Operator conv_operator, int threadblock_K, Conv2dProblemSize const &problem_size, - IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic) { + IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic, + GroupMode group_mode = GroupMode::kNone, + int threadblock_N = 0) { int iterations = 0; - if (algorithm == IteratorAlgorithm::kFixedChannels) { + if (group_mode == GroupMode::kNone) { - int positions_per_iteration = threadblock_K / problem_size.C; - switch (conv_operator) { - case Operator::kFprop: - iterations = (problem_size.R * problem_size.S + positions_per_iteration - 1 ) / positions_per_iteration; - break; + if (algorithm == IteratorAlgorithm::kFixedChannels) { - default: - break; + int positions_per_iteration = threadblock_K / problem_size.C; + switch (conv_operator) { + case Operator::kFprop: + iterations = (problem_size.R * problem_size.S + positions_per_iteration - 1 ) / positions_per_iteration; + break; + + default: + break; + } } - } - else if (algorithm == IteratorAlgorithm::kFewChannels) { + else if (algorithm == IteratorAlgorithm::kFewChannels) { - switch (conv_operator) { - case Operator::kFprop: - iterations = (problem_size.R * problem_size.S * problem_size.C + threadblock_K - 1 ) / threadblock_K; - break; + switch (conv_operator) { + case Operator::kFprop: + iterations = (problem_size.R * problem_size.S * problem_size.C + threadblock_K - 1 ) / threadblock_K; + break; - default: - break; + default: + break; + } + } + else { + int elements_per_split_k_slice = 0; + + switch (conv_operator) { + case Operator::kFprop: + elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kDeconv: + case Operator::kDgrad: + elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kWgrad: + elements_per_split_k_slice = (problem_size.N * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K; + break; + + default: + break; + } } - } - else { - int elements_per_split_k_slice = 0; - switch (conv_operator) { - case Operator::kFprop: - elements_per_split_k_slice 
= (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices; - iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); - break; - - case Operator::kDgrad: - elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices; - iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); - break; - - case Operator::kWgrad: - elements_per_split_k_slice = (problem_size.N * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; - iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K; - break; - - default: - break; + } else if (group_mode == GroupMode::kDepthwise) { + int channels_per_cta = threadblock_N; + + if (algorithm == IteratorAlgorithm::kAnalytic) { + switch (conv_operator) { + case Operator::kFprop: + iterations = problem_size.R * problem_size.S * + ((channels_per_cta + threadblock_K - 1) / threadblock_K); + break; + + default: + break; + } + } + } else { // Group conv + + int channels_per_group = problem_size.C / problem_size.groups; + int k_per_group = problem_size.K / problem_size.groups; + + if (algorithm == IteratorAlgorithm::kAnalytic) { + switch (conv_operator) { + case Operator::kFprop: + iterations = problem_size.R * problem_size.S * ((channels_per_group + threadblock_K - 1) / threadblock_K); + // In group conv, if k_per_group < threadblock_N, one Threadblock will calculate multiple groups + if (problem_size.groups != 1) { + if (k_per_group < threadblock_N) { + iterations *= threadblock_N / k_per_group; + } + } + break; + + default: + break; + } + } else if (algorithm == IteratorAlgorithm::kOptimized) { + // Current optimized iterator only support GroupMode::kSingleGroup + if (group_mode == GroupMode::kSingleGroup) { + switch (conv_operator) { + case Operator::kFprop: + iterations = problem_size.R * problem_size.S * ((channels_per_group + threadblock_K - 1) / threadblock_K); + break; + + default: + break; + } + } } + } return iterations; } +template +CUTLASS_HOST_DEVICE +int depthwise_gemm_k_iterations( + Operator conv_operator, + int threadblock_K, + Conv2dProblemSize const &problem_size, + IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic, + GroupMode group_mode = GroupMode::kNone, + int threadblock_N = 0) { + + int n = problem_size.N; + int p = (problem_size.P + Output_P - 1) / Output_P; + int q = (problem_size.Q + Output_Q - 1) / Output_Q; + + int iterations = (n * p * q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + return iterations; +} + + +CUTLASS_HOST_DEVICE +int implicit_gemm_k_iterations_per_channel( + Operator conv_operator, + Conv2dProblemSize const &problem_size, + IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic) { + + int iterations = 0; //0 means not applicable + if (algorithm == IteratorAlgorithm::kAnalytic || algorithm == IteratorAlgorithm::kOptimized) { + switch (conv_operator) { + case Operator::kFprop: + iterations = problem_size.R * problem_size.S; + break; + + case Operator::kDeconv: + case Operator::kDgrad: + iterations = problem_size.R * problem_size.S; + break; + + default: + break; + } + } + return iterations; +} + //////////////////////////////////////////////////////////////////////////////// // Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output) //////////////////////////////////////////////////////////////////////////////// @@ -427,6 +521,7 
@@ cutlass::Tensor4DCoord implicit_gemm_tensor_a_extent( Conv2dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.activation_extent(); + case cutlass::conv::Operator::kDeconv: case cutlass::conv::Operator::kDgrad: return problem_size.output_extent(); case cutlass::conv::Operator::kWgrad: return problem_size.output_extent(); default : break; @@ -441,6 +536,7 @@ cutlass::Tensor4DCoord implicit_gemm_tensor_b_extent( Conv2dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.filter_extent(); + case cutlass::conv::Operator::kDeconv: return problem_size.filter_extent(true); case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent(); case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent(); default : break; @@ -455,6 +551,7 @@ cutlass::Tensor4DCoord implicit_gemm_tensor_c_extent( Conv2dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.output_extent(); + case cutlass::conv::Operator::kDeconv: case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent(); case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent(); default : break; @@ -469,6 +566,7 @@ int64_t implicit_gemm_tensor_a_size( Conv2dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.activation_size(); + case cutlass::conv::Operator::kDeconv: case cutlass::conv::Operator::kDgrad: return problem_size.output_size(); case cutlass::conv::Operator::kWgrad: return problem_size.output_size(); default : break; @@ -483,6 +581,7 @@ int64_t implicit_gemm_tensor_b_size( Conv2dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.filter_size(); + case cutlass::conv::Operator::kDeconv: case cutlass::conv::Operator::kDgrad: return problem_size.filter_size(); case cutlass::conv::Operator::kWgrad: return problem_size.activation_size(); default : break; @@ -497,6 +596,7 @@ int64_t implicit_gemm_tensor_c_size( Conv2dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.output_size(); + case cutlass::conv::Operator::kDeconv: case cutlass::conv::Operator::kDgrad: return problem_size.activation_size(); case cutlass::conv::Operator::kWgrad: return problem_size.filter_size(); default : break; @@ -537,12 +637,12 @@ void strided_dgrad_starting_coords( // function locals for remainder by fast divmod int pad_h_rem_, pad_w_rem_; - // start_h = platform::abs(problem_size.stride_h - ((problem_size.pad_h % problem_size.stride_h) - r)) % problem_size.stride_h; + // start_h = std::abs(problem_size.stride_h - ((problem_size.pad_h % problem_size.stride_h) - r)) % problem_size.stride_h; stride_h_divmod.divmod(pad_h_rem_, problem_size.pad_h); int r_ = absolute_value(problem_size.stride_h - (pad_h_rem_ - r)); stride_h_divmod.divmod(start_h, r_); - //start_w = platform::abs(problem_size.stride_w - ((problem_size.pad_w % problem_size.stride_w) - s)) % problem_size.stride_w; + //start_w = std::abs(problem_size.stride_w - ((problem_size.pad_w % problem_size.stride_w) - s)) % problem_size.stride_w; stride_w_divmod.divmod(pad_w_rem_, problem_size.pad_w); int s_ = absolute_value(problem_size.stride_w - (pad_w_rem_ - s)); stride_w_divmod.divmod(start_w, s_); diff --git a/include/cutlass/conv/conv3d_problem_size.h 
b/include/cutlass/conv/conv3d_problem_size.h index 82ea1cef46..9a9514f2d8 100644 --- a/include/cutlass/conv/conv3d_problem_size.h +++ b/include/cutlass/conv/conv3d_problem_size.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -80,11 +80,11 @@ struct Conv3dProblemSize : public Conv2dProblemSize { public: CUTLASS_HOST_DEVICE Conv3dProblemSize(): + Conv2dProblemSize(), D(0), T(0), Z(0), - pad_d(0), + pad_d(0), stride_d(1), - dilation_d(1), - Conv2dProblemSize() { } + dilation_d(1) { } /// Constructor for default padding, stride, dilation, and split-K CUTLASS_HOST_DEVICE @@ -102,10 +102,10 @@ struct Conv3dProblemSize : public Conv2dProblemSize { int R, int S, Mode mode - ): + ): + Conv2dProblemSize(N, H, W, C, P, Q, K, R, S, mode), D(D), T(T), Z(Z), - pad_d(T / 2), stride_d(1), dilation_d(1), - Conv2dProblemSize(N, H, W, C, P, Q, K, R, S, mode) { } + pad_d(T / 2), stride_d(1), dilation_d(1) { } /// Constructor CUTLASS_HOST_DEVICE @@ -134,15 +134,15 @@ struct Conv3dProblemSize : public Conv2dProblemSize { Mode mode, int split_k_slices = 1, int groups = 1 - ): - D(D), T(T), Z(Z), - pad_d(pad_d), stride_d(stride_d), dilation_d(dilation_d), + ): Conv2dProblemSize( - N, H, W, C, K, R, S, P, Q, - pad_h, pad_w, - stride_h, stride_w, - dilation_h, dilation_w, - mode, split_k_slices, groups) { } + N, H, W, C, K, R, S, P, Q, + pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + mode, split_k_slices, groups), + D(D), T(T), Z(Z), + pad_d(pad_d), stride_d(stride_d), dilation_d(dilation_d) { } /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D // set *user-defined* output size and sets Z, P, and Q (include all data members in ctor) @@ -158,8 +158,6 @@ struct Conv3dProblemSize : public Conv2dProblemSize { int split_k_slices = 1, int groups = 1 ): - D(input_size.d()), T(filter_size.d()), Z(output_size.d()), - pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]), Conv2dProblemSize( {input_size.n(), input_size.h(), input_size.w(), input_size.c()}, {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()}, @@ -167,8 +165,9 @@ struct Conv3dProblemSize : public Conv2dProblemSize { {stride[1], stride[2]}, {dilation[1], dilation[2]}, {output_size.n(), output_size.h(), output_size.w(), output_size.c()}, - mode, split_k_slices, groups - ) { } + mode, split_k_slices, groups), + D(input_size.d()), T(filter_size.d()), Z(output_size.d()), + pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]) { } /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D // *computes* output size and sets Z, P and Q (include all data members in ctor) @@ -183,18 +182,46 @@ struct Conv3dProblemSize : public Conv2dProblemSize { int split_k_slices = 1, int groups = 1 ): - D(input_size.d()), T(filter_size.d()), - pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]), Conv2dProblemSize( {input_size.n(), input_size.h(), input_size.w(), input_size.c()}, {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()}, {padding[1], padding[1], padding[2], padding[2]}, {stride[1], stride[2]}, {dilation[1], dilation[2]}, - mode, split_k_slices, groups - ) { + mode, split_k_slices, groups), + 
D(input_size.d()), T(filter_size.d()), + pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]) + { + // set output Z + Z = ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1; + } + + /// Constructs convolution problem size from cutlass Tensor5DCoord, Coord3D + // *computes* output size and sets Z, P and Q (include all data members in ctor) + CUTLASS_HOST_DEVICE + Conv3dProblemSize( + cutlass::Tensor5DCoord input_size, // NDHWC + cutlass::Tensor5DCoord filter_size, // KTRSC + CUTLASS_STL_NAMESPACE::tuple padding, // Coord3D {pad_d, pad_h, pad_w} & Coord3D {far pad_d, pad_h, pad_w} to calculate o/p/q + Coord3D stride, // stride_d, stride_h, stride_w + Coord3D dilation, // dilation_d, dilation_h, dilation_w + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + Conv2dProblemSize( + {input_size.n(), input_size.h(), input_size.w(), input_size.c()}, + {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()}, + {CUTLASS_STL_NAMESPACE::get<0>(padding)[1], CUTLASS_STL_NAMESPACE::get<1>(padding)[1], + CUTLASS_STL_NAMESPACE::get<0>(padding)[2], CUTLASS_STL_NAMESPACE::get<1>(padding)[2]}, + {stride[1], stride[2]}, + {dilation[1], dilation[2]}, + mode, split_k_slices, groups), + D(input_size.d()), T(filter_size.d()), + pad_d(CUTLASS_STL_NAMESPACE::get<0>(padding)[0]), stride_d(stride[0]), dilation_d(dilation[0]) + { // set output Z - Z = ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1; + Z = ((D + pad_d + CUTLASS_STL_NAMESPACE::get<1>(padding)[0] - T * dilation_d) / stride_d) + 1; } /// Equality operator (ignores mode and split_k_slice) @@ -205,8 +232,8 @@ struct Conv3dProblemSize : public Conv2dProblemSize { (K == conv.K) && (T == conv.T) && (R == conv.R) && (S == conv.S) && (Z == conv.Z) &&(P == conv.P) && (Q == conv.Q) && (pad_d == conv.pad_d) && (pad_h == conv.pad_h) && (pad_w == conv.pad_w) && - (stride_d == conv.stride_d) && (stride_h == conv.stride_h) && (stride_w == conv.stride_h) && - (dilation_d == conv.dilation_d) && (dilation_h == conv.dilation_h) && (dilation_h == conv.dilation_h) + (stride_d == conv.stride_d) && (stride_h == conv.stride_h) && (stride_w == conv.stride_w) && + (dilation_d == conv.dilation_d) && (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w) ); } @@ -241,9 +268,10 @@ struct Conv3dProblemSize : public Conv2dProblemSize { /// Returns filter extent as Tensor5DCoord CUTLASS_HOST_DEVICE - cutlass::Tensor5DCoord filter_extent() const { + cutlass::Tensor5DCoord filter_extent(bool is_deconv = false) const { - return cutlass::Tensor5DCoord ({K, T, R, S, C}); + return is_deconv ? 
cutlass::Tensor5DCoord ({C, T, R, S, K}) + : cutlass::Tensor5DCoord ({K, T, R, S, C}); } /// Returns output extent as Tensor5DCoord @@ -274,7 +302,7 @@ struct Conv3dProblemSize : public Conv2dProblemSize { return (N * Z * P * Q * K); } - /// Returns output extent as Tensor5DCoord + /// Returns padding as Coord3D CUTLASS_HOST_DEVICE Coord3D padding() const { @@ -315,6 +343,7 @@ cutlass::gemm::GemmCoord implicit_gemm_problem_size( problem_size.K, problem_size.T * problem_size.R * problem_size.S * problem_size.C ); + case Operator::kDeconv: case Operator::kDgrad: return gemm::GemmCoord( problem_size.N * problem_size.D * problem_size.H * problem_size.W, @@ -339,29 +368,47 @@ int implicit_gemm_k_iterations( Operator conv_operator, int threadblock_K, Conv3dProblemSize const &problem_size, - IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic) { + IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic, + GroupMode group_mode = GroupMode::kNone, + int threadblock_N = 0) { int iterations = 0; int elements_per_split_k_slice = 0; - - switch (conv_operator) { - case Operator::kFprop: - elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices; - iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); - break; - - case Operator::kDgrad: - elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices; - iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); - break; - - case Operator::kWgrad: - elements_per_split_k_slice = (problem_size.N * problem_size.Z * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; - iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K; - break; - - default: - break; + if (group_mode == GroupMode::kNone) { + switch (conv_operator) { + case Operator::kFprop: + elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kDeconv: + case Operator::kDgrad: + elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kWgrad: + elements_per_split_k_slice = (problem_size.N * problem_size.Z * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K; + break; + + default: + break; + } + } else if (group_mode == GroupMode::kDepthwise) { + int channels_per_cta = threadblock_N; + + if (algorithm == IteratorAlgorithm::kAnalytic) { + switch (conv_operator) { + case Operator::kFprop: + iterations = problem_size.T * problem_size.R * problem_size.S * + ((channels_per_cta + threadblock_K - 1) / threadblock_K); + break; + + default: + break; + } + } } return iterations; @@ -377,6 +424,7 @@ cutlass::Tensor5DCoord implicit_gemm_tensor_a_extent( Conv3dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.activation_extent(); + case cutlass::conv::Operator::kDeconv: case 
cutlass::conv::Operator::kDgrad: return problem_size.output_extent(); case cutlass::conv::Operator::kWgrad: return problem_size.output_extent(); default : break; @@ -391,6 +439,7 @@ cutlass::Tensor5DCoord implicit_gemm_tensor_b_extent( Conv3dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.filter_extent(); + case cutlass::conv::Operator::kDeconv: return problem_size.filter_extent(true); case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent(); case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent(); default : break; @@ -405,6 +454,7 @@ cutlass::Tensor5DCoord implicit_gemm_tensor_c_extent( Conv3dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.output_extent(); + case cutlass::conv::Operator::kDeconv: case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent(); case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent(); default : break; @@ -419,6 +469,7 @@ int64_t implicit_gemm_tensor_a_size( Conv3dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.activation_size(); + case cutlass::conv::Operator::kDeconv: case cutlass::conv::Operator::kDgrad: return problem_size.output_size(); case cutlass::conv::Operator::kWgrad: return problem_size.output_size(); default : break; @@ -433,6 +484,7 @@ int64_t implicit_gemm_tensor_b_size( Conv3dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.filter_size(); + case cutlass::conv::Operator::kDeconv: case cutlass::conv::Operator::kDgrad: return problem_size.filter_size(); case cutlass::conv::Operator::kWgrad: return problem_size.activation_size(); default : break; @@ -447,6 +499,7 @@ int64_t implicit_gemm_tensor_c_size( Conv3dProblemSize const &problem_size) { switch (conv_operator) { case cutlass::conv::Operator::kFprop: return problem_size.output_size(); + case cutlass::conv::Operator::kDeconv: case cutlass::conv::Operator::kDgrad: return problem_size.activation_size(); case cutlass::conv::Operator::kWgrad: return problem_size.filter_size(); default : break; diff --git a/include/cutlass/conv/convnd_problem_shape.hpp b/include/cutlass/conv/convnd_problem_shape.hpp new file mode 100644 index 0000000000..cd2f674ff4 --- /dev/null +++ b/include/cutlass/conv/convnd_problem_shape.hpp @@ -0,0 +1,601 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This file contains definitions and utility functions for describing convolution problem shapes. +*/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_coord.h" +#include "cutlass/conv/convolution.h" + +#include "cute/container/array.hpp" + +#if ! defined(__CUDACC_RTC__) +#include +#endif + + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Implements the user facing argument for all CUTLASS 3.x convolutions in a rank agnostic fashion. +// All tensors are flat and by default treated as layout right (NDHWC, KTRSC, NZPQK) +// Supports asymmetric padding, traversal strides, dilations, and all conv algorithm types. +template < + conv::Operator ConvOp_, + int NumSpatialDimensions_ +> +struct ConvProblemShape { + // + // Alias types for members + // + + static constexpr int RankS = NumSpatialDimensions_; + static constexpr int RankT = NumSpatialDimensions_ + 2; + static constexpr conv::Operator ConvOp = ConvOp_; + static constexpr int NumSpatialDimensions = NumSpatialDimensions_; + using SpatialExtent = cute::array; + using TensorExtent = cute::array; + using TensorStride = cute::array; + using ShapePadding = SpatialExtent; + using TraversalStride = SpatialExtent; + using ShapeDilation = SpatialExtent; + using Corner = SpatialExtent; + + // + // Members + // + cutlass::conv::Mode mode{}; + TensorExtent shape_A{}; + TensorStride stride_A{}; + TensorExtent shape_B{}; + TensorStride stride_B{}; + TensorExtent shape_C{}; + TensorStride stride_C{}; + + // asymmetric padding, both upper and lower padding must be >= 0 + ShapePadding lower_padding{}; + ShapePadding upper_padding{}; + TraversalStride traversal_stride{}; + ShapeDilation dilation{}; + int groups = 1; + + // + // Methods + // + + ConvProblemShape() = default; + + // Constructor accepts user facing arguments and computes to stores the corners as its internal state + ConvProblemShape( + conv::Mode mode, // convolution/cross-correlation + TensorExtent shape_act, // [n,d,h,w,c] + TensorStride stride_act, // [n,d,h,w,c] + TensorExtent shape_flt, // [k,t,r,s,c] + TensorStride stride_flt, // [k,t,r,s,c] + ShapePadding lower_padding, // [pad_d, pad_h, pad_w] + ShapePadding upper_padding, // [pad_d, pad_h, pad_w] + TraversalStride tstride, // [stride_d, stride_h, stride_w] + ShapeDilation dilation, // [dilation_d, dilation_h, dilation_w] + int groups) + : mode(mode) + , lower_padding(lower_padding) + , upper_padding(upper_padding) + 
, traversal_stride(tstride) + , dilation(dilation) + , groups(groups) { + + auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt); + set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act); + } + + // Allow user input of xformed activation stride to support non-packed strides. + ConvProblemShape( + conv::Mode mode, // convolution/cross-correlation + TensorExtent shape_act, // [n,d,h,w,c] + TensorStride stride_act, // [n,d,h,w,c] + TensorExtent shape_flt, // [k,t,r,s,c] + TensorStride stride_flt, // [k,t,r,s,c] + TensorStride stride_xformed_act, // [n,z,p,q,k] + ShapePadding lower_padding, // [pad_d, pad_h, pad_w] + ShapePadding upper_padding, // [pad_d, pad_h, pad_w] + TraversalStride tstride, // [stride_d, stride_h, stride_w] + ShapeDilation dilation, // [dilation_d, dilation_h, dilation_w] + int groups) + : mode(mode) + , lower_padding(lower_padding) + , upper_padding(upper_padding) + , traversal_stride(tstride) + , dilation(dilation) + , groups(groups) { + + CUTLASS_ASSERT(stride_act[RankT - 1] == 1); + CUTLASS_ASSERT(stride_flt[RankT - 1] == 1); + CUTLASS_ASSERT(stride_xformed_act[RankT - 1] == 1); + + auto stride_act_packed = packed_stride_right_major(shape_act); + auto stride_flt_packed = packed_stride_right_major(shape_flt); + auto [shape_xformed_act, stride_xformed_act_packed] = calculate_xformed_act(shape_act, shape_flt); + + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < RankT - 1; ++i) { + CUTLASS_ASSERT(stride_act[i] >= stride_act_packed[i]); + CUTLASS_ASSERT(stride_flt[i] >= stride_flt_packed[i]); + CUTLASS_ASSERT(stride_xformed_act[i] >= stride_xformed_act_packed[i]); + } + + set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act); + } + + // Constructor accepts user facing arguments and presume packed tensor strides in canonical (CWHDN) order. + ConvProblemShape( + conv::Mode mode, + TensorExtent shape_act, + TensorExtent shape_flt, + ShapePadding lower_padding, + ShapePadding upper_padding, + TraversalStride tstride, + ShapeDilation dilation, + int groups) + : ConvProblemShape( + mode, + shape_act, + packed_stride_right_major(shape_act), + shape_flt, + packed_stride_right_major(shape_flt), + lower_padding, + upper_padding, + tstride, + dilation, + groups) { + } + +#if ! 
defined(__CUDACC_RTC__) + // Constructor accepts user facing arguments and computes to stores the corners as its internal state + ConvProblemShape( + conv::Mode mode, + std::initializer_list shape_act_, + std::initializer_list stride_act_, + std::initializer_list shape_flt_, + std::initializer_list stride_flt_, + std::initializer_list lower_padding_, + std::initializer_list upper_padding_, + std::initializer_list traversal_stride_, + std::initializer_list dilation_, + int groups) + : mode(mode) + , groups(groups) { + + TensorExtent shape_act{}; + TensorStride stride_act{}; + TensorExtent shape_flt{}; + TensorStride stride_flt{}; + + assert(shape_act_.size() == shape_act.size()); + assert(stride_act_.size() == stride_act.size()); + assert(shape_flt_.size() == shape_flt.size()); + assert(stride_flt_.size() == stride_flt.size()); + assert(lower_padding_.size() == lower_padding.size()); + assert(upper_padding_.size() == upper_padding.size()); + assert(traversal_stride_.size() == traversal_stride.size()); + assert(dilation_.size() == dilation.size()); + + std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin()); + std::copy(stride_act_.begin(), stride_act_.end(), stride_act.begin()); + std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin()); + std::copy(stride_flt_.begin(), stride_flt_.end(), stride_flt.begin()); + std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin()); + std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin()); + std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin()); + std::copy(dilation_.begin(), dilation_.end(), dilation.begin()); + + auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt); + set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act); + } + + // Allow user input of xformed activation stride to support non-packed strides. 
+ ConvProblemShape( + conv::Mode mode, + std::initializer_list shape_act_, + std::initializer_list stride_act_, + std::initializer_list shape_flt_, + std::initializer_list stride_flt_, + std::initializer_list stride_xformed_act_, + std::initializer_list lower_padding_, + std::initializer_list upper_padding_, + std::initializer_list traversal_stride_, + std::initializer_list dilation_, + int groups) + : mode(mode) + , groups(groups) { + TensorExtent shape_act{}; + TensorStride stride_act{}; + TensorExtent shape_flt{}; + TensorStride stride_flt{}; + TensorStride stride_xformed_act{}; + + std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin()); + std::copy(stride_act_.begin(), stride_act_.end(), stride_act.begin()); + std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin()); + std::copy(stride_flt_.begin(), stride_flt_.end(), stride_flt.begin()); + std::copy(stride_xformed_act_.begin(), stride_xformed_act_.end(), stride_xformed_act.begin()); + std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin()); + std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin()); + std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin()); + std::copy(dilation_.begin(), dilation_.end(), dilation.begin()); + + CUTLASS_ASSERT(stride_act[RankT - 1] == 1); + CUTLASS_ASSERT(stride_flt[RankT - 1] == 1); + CUTLASS_ASSERT(stride_xformed_act[RankT - 1] == 1); + + auto stride_act_packed = packed_stride_right_major(shape_act); + auto stride_flt_packed = packed_stride_right_major(shape_flt); + auto [shape_xformed_act, stride_xformed_act_packed] = calculate_xformed_act(shape_act, shape_flt); + + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < RankT - 1; ++i) { + CUTLASS_ASSERT(stride_act[i] >= stride_act_packed[i]); + CUTLASS_ASSERT(stride_flt[i] >= stride_flt_packed[i]); + CUTLASS_ASSERT(stride_xformed_act[i] >= stride_xformed_act_packed[i]); + } + + set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act); + } + + // Constructor accepts user facing arguments and computes to stores the corners as its internal state + ConvProblemShape( + conv::Mode mode, + std::initializer_list shape_act_, + std::initializer_list shape_flt_, + std::initializer_list lower_padding_, + std::initializer_list upper_padding_, + std::initializer_list traversal_stride_, + std::initializer_list dilation_, + int groups) + : mode(mode) + , groups(groups) { + TensorExtent shape_act{}; + TensorStride stride_act{}; + TensorExtent shape_flt{}; + TensorStride stride_flt{}; + + assert(shape_act_.size() == shape_act.size()); + assert(shape_flt_.size() == shape_flt.size()); + assert(lower_padding_.size() == lower_padding.size()); + assert(upper_padding_.size() == upper_padding.size()); + assert(traversal_stride_.size() == traversal_stride.size()); + assert(dilation_.size() == dilation.size()); + + std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin()); + std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin()); + std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin()); + std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin()); + std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin()); + std::copy(dilation_.begin(), dilation_.end(), dilation.begin()); + stride_act = packed_stride_right_major(shape_act); + stride_flt = packed_stride_right_major(shape_flt); + + auto [shape_xformed_act, stride_xformed_act] = 
calculate_xformed_act(shape_act, shape_flt); + set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act); + } +#endif // not defined(__CUDACC_RTC__) + + // Set shape and stride of tensor A/B/C according to following table: + // | | Fprop | Dgrad | Wgrad | + // | ------ | ------ | ------ | ------| + // | ShapeA | NDHWC | NZPQK | NZPQK | + // | ShapeB | KTRSC | KTRSC | NDHWC | + // | ShapeC | NZPQK | NDHWC | KTRSC | + // + // Input comes from calculate_xformed_act, which does NOT depend on ConvOp. + CUTLASS_HOST_DEVICE + constexpr void + set_shape_stride_ABC( + TensorExtent shape_act, + TensorStride stride_act, + TensorExtent shape_flt, + TensorStride stride_flt, + TensorExtent shape_xformed_act, + TensorStride stride_xformed_act) { +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + printf("*** set_shape_stride_ABC ***"); + printf("\n shape_act: "); + print(shape_act); + printf("\n stride_act: "); + print(stride_act); + printf("\n shape_flt: "); + print(shape_flt); + printf("\n stride_flt: "); + print(stride_flt); + printf("\n shape_xformed_act: "); + print(shape_xformed_act); + printf("\n stride_xformed_act: "); + print(stride_xformed_act); + if constexpr (ConvOp == cutlass::conv::Operator::kFprop) { + printf("\n ConvOp: Fprop"); + } + if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) { + printf("\n ConvOp: Dgrad"); + } + if constexpr (ConvOp == cutlass::conv::Operator::kWgrad) { + printf("\n ConvOp: Wgrad"); + } + printf("\n"); +#endif + + if constexpr (ConvOp == cutlass::conv::Operator::kFprop) { + shape_A = shape_act; + stride_A = stride_act; + shape_B = shape_flt; + stride_B = stride_flt; + shape_C = shape_xformed_act; + stride_C = stride_xformed_act; + } + else if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) { + shape_A = shape_xformed_act; + stride_A = stride_xformed_act; + shape_B = shape_flt; + stride_B = stride_flt; + shape_C = shape_act; + stride_C = stride_act; + } + else if constexpr (ConvOp == cutlass::conv::Operator::kWgrad) { + shape_A = shape_xformed_act; + stride_A = stride_xformed_act; + shape_B = shape_act; + stride_B = stride_act; + shape_C = shape_flt; + stride_C = stride_flt; + } +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + printf("\n shape_A: "); + print(shape_A); + printf("\n stride_A: "); + print(stride_A); + printf("\n shape_B: "); + print(shape_B); + printf("\n stride_B: "); + print(stride_B); + printf("\n shape_C: "); + print(shape_C); + printf("\n stride_C: "); + print(stride_C); +#endif + } + + // Get A extents. + // fprop: A extents array contains [N,D,H,W,C]. Turn that into ((W,H,D,N), (C)) + // dgrad: A extents array contains [N,Z,P,Q,K]. Turn that into ((Q,P,Z,N), (K)) + // wgrad: A extents array contains [N,Z,P,Q,K]. Turn that into ((K), (Q,P,Z,N)) + CUTLASS_HOST_DEVICE + constexpr auto + get_shape_A() const { + using cute::make_shape; + using cute::take; + + if constexpr (ConvOp == conv::Operator::kFprop || + ConvOp == conv::Operator::kDgrad) { + return make_shape( + cute::reverse(take<0, RankT - 1>(shape_A)), + shape_A[RankT - 1]); + } + // For wgrad kernel, we need to linearize NZPQ for tensor A + else if constexpr (ConvOp == conv::Operator::kWgrad) { + return make_shape( + shape_A[RankT - 1], + cute::product(take<0, RankT - 1>(shape_A))); + } + } + + // Get B extents. + // fprop: B extents array contains [K,T,R,S,C]. Turn that into ((K), (C,S,R,T)) + // dgrad: B extents array contains [K,T,R,S,C]. 
Turn that into ((C), (K,S,R,T)) + // wgrad: B extents array contains [N,D,H,W,C]. Turn that into ((C), (W,H,D,N)) + CUTLASS_HOST_DEVICE + constexpr auto + get_shape_B() const { + using cute::make_shape; + using cute::reverse; + using cute::take; + + if constexpr (ConvOp == conv::Operator::kFprop) { + return make_shape( + shape_B[0], + reverse(take<1, RankT>(shape_B))); + } + else if constexpr (ConvOp == conv::Operator::kWgrad) { + return make_shape( + shape_B[RankT - 1], + reverse(take<0, RankT - 1>(shape_B))); + } + else if constexpr (ConvOp == conv::Operator::kDgrad) { + // shape_B: [K,T,R,S,C], return: [(C),(K,S,R,T)] + return make_shape( + shape_B[RankT - 1], + cute::insert<0>( + reverse(take<1, RankT - 1>(shape_B)), + shape_B[0])); + } + } + + // Get C extents. + // fprop: C extents array contains [N,Z,P,Q,K]. Turn that into ((Q,P,Z,N), (K)) + // dgrad: C extents array contains [N,D,H,W,C]. Turn that into ((W,H,D,N), (C)) + // wgrad: C extents array contains [K,T,R,S,C]. Turn that into ((K), (C,S,R,T)) + CUTLASS_HOST_DEVICE + constexpr auto + get_shape_C() const { + using cute::make_shape; + using cute::reverse; + using cute::take; + + if constexpr (ConvOp == conv::Operator::kFprop || + ConvOp == conv::Operator::kDgrad) { + return make_shape( + reverse(take<0, RankT - 1>(shape_C)), + shape_C[RankT - 1]); + } + else if constexpr (ConvOp == conv::Operator::kWgrad) { + return make_shape( + shape_C[0], + reverse(take<1, RankT>(shape_C))); + } + } + + // Static method that returns the canonical strides of tensors (layouts are right major and compact) + CUTLASS_HOST_DEVICE + static constexpr TensorStride + packed_stride_right_major(TensorExtent const& extents) { + TensorStride strides{}; + strides[RankT-1] = 1; + cute::for_each(cute::make_rseq{}, [&](auto i) { + strides[i] = extents[i+1] * strides[i+1]; + }); + return strides; + } + + // Static method that returns the packed logical size of any TensorExtent + CUTLASS_HOST_DEVICE + static constexpr size_t + size(TensorExtent const& extents) { + size_t size = 1; + cute::for_each(cute::make_seq{}, [&](auto i) { + size *= extents[i]; + }); + return size; + } + + CUTLASS_HOST_DEVICE + constexpr size_t + size_A() const { + return shape_A[0] * stride_A[0]; + } + + CUTLASS_HOST_DEVICE + constexpr size_t + size_B() const { + return shape_B[0] * stride_B[0]; + } + + CUTLASS_HOST_DEVICE + constexpr size_t + size_C() const { + return shape_C[0] * stride_C[0]; + } + + // Equality operator + CUTLASS_HOST_DEVICE + bool operator==(ConvProblemShape const& rhs) const { + using cute::for_each; + using cute::make_seq; + + bool is_equal = true; + + // Compare all tensor extents + for_each(make_seq{}, [&](auto i) { + is_equal = is_equal + && (shape_A[i] == rhs.shape_A[i]) + && (shape_B[i] == rhs.shape_B[i]); + }); + + // Compare all spatial extents + for_each(make_seq{}, [&](auto i) { + is_equal = is_equal + && (lower_padding[i] == rhs.lower_padding[i]) + && (upper_padding[i] == rhs.upper_padding[i]) + && (traversal_stride[i] == rhs.traversal_stride[i]) + && (dilation[i] == rhs.dilation[i]); + }); + + return is_equal; + } + + /// Inequality operator + CUTLASS_HOST_DEVICE + bool operator!=(ConvProblemShape const &rhs) const { + return !(*this == rhs); + } + +private: + CUTLASS_HOST_DEVICE + constexpr auto + calculate_xformed_act(TensorExtent shape_act, TensorExtent shape_flt) { + TensorExtent shape_xformed_act{}; + // calculate n,z,p,q,k. 
+ // a helper lambda to compute a single spatial extent of the nzpqk tensor + auto nzpqk_extent = [](int act_ext, int filter_ext, int pad_total, int dilation, int tstride) { + return 1 + (act_ext + pad_total - ((filter_ext -1) * dilation + 1)) / tstride; + }; + + shape_xformed_act[0] = shape_act[0]; // Activation N extent + cute::for_each(cute::make_seq{}, [&](auto i) { + shape_xformed_act[i+1] = nzpqk_extent( + shape_act[i+1], shape_flt[i+1], upper_padding[i] + lower_padding[i], dilation[i], traversal_stride[i]); + }); + shape_xformed_act[RankT-1] = shape_flt[0]; // Filter K extent + + TensorStride stride_xformed_act = packed_stride_right_major(shape_xformed_act); + + return cute::make_tuple(shape_xformed_act, stride_xformed_act); + } +}; + +template< + conv::Operator ConvOp, + int SpatialDim +> +void print(ConvProblemShape const& problem) { + printf("ConvProblemShape with %d spatial dimensions implementing cutlass::conv::Operator::%d\n", + SpatialDim, int(ConvOp)); + printf("\tTensorA: "); + cute::print(problem.shape_A); printf(":"); + cute::print(problem.stride_A); printf("\n"); + printf("\tTensorB: "); + cute::print(problem.shape_B); printf(":"); + cute::print(problem.stride_B); printf("\n"); + printf("\tTensorC: "); + cute::print(problem.shape_C); printf(":"); + cute::print(problem.stride_C); printf("\n"); + printf("\tLower padding: "); print(problem.lower_padding); printf("\n"); + printf("\tUpper padding: "); print(problem.upper_padding); printf("\n"); + printf("\tTraversal strides: "); print(problem.traversal_stride); printf("\n"); + printf("\tDilation: "); print(problem.dilation); printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/convolution.h b/include/cutlass/conv/convolution.h index 52a4636c12..243ee269dd 100644 --- a/include/cutlass/conv/convolution.h +++ b/include/cutlass/conv/convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -29,18 +29,18 @@ * **************************************************************************************************/ /*! \file - \brief + \brief -This file contains definitions and utility functions for describing convolution problem sizes in terms of -activation (NHWC), filter (KRSC), output (NPQK), pading (pad_h, pad_w), stride (stride_h, stride_w), -dilation (dilation_h, dilation_w). Furthermore, it defines helper functions to map cutlass' implicit gemm -tensor extents, sizes, data types to that of convolutions extents, sizes, and data types. +This file contains definitions and utility functions for describing convolution problem sizes in terms of +activation (NHWC), filter (KRSC), output (NPQK), padding (pad_h, pad_w), stride (stride_h, stride_w), and +dilation (dilation_h, dilation_w). Furthermore, it defines helper functions to map CUTLASS's implicit gemm +tensor extents, sizes, and data types to that of the convolution's extents, sizes, and data types. 
* Mapping convolutions to Gemm computation * -Cutlass employs ImplicitGemm algorithm to implement convolutions. ImplicitGemm algorithm runs gemm operation -on convolution tensors Activation, Filter, and Output . The underlying gemm operation follows the standard -gemm definition: +Cutlass implements convolutions with the Implicit Gemm algorithm. This algorithm performs a gemm +(general matrix-matrix multiply) on the convolution tensors Activation, Filter, and Output. +The underlying gemm operation follows the standard gemm definition: C = A * B + C @@ -48,22 +48,23 @@ gemm definition: C is source and output matrix -For the three convolutional operators (Fprop, Dgrad, Wgrad), ImplicitGemm matrices A, B, and C are mapped on -to convolution tensors Activation, Filter and Output as per the below table: +For the three convolutional operators (Fprop, Dgrad, Wgrad), ImplicitGemm matrices A, B, and C are mapped +to convolution tensors Activation, Filter and Output as described in the table below. ___________________________________________________________________________ - ConvolutionalOperator | A | B | C + ConvolutionalOperator | A | B | C ___________________________________________________________________________ | | | | | - | Fprop | Activation | Filter | Output | - | Dgrad | Output | Filter | Activation | - | Wgrad | Output | Activation | Filter | + | Fprop | Activation | Filter | Output | + | Dgrad | Output | Filter | Activation | + | Wgrad | Output | Activation | Filter | ___________________________________________________________________________ -In convolution codebase, DO NOT mix using (A, B, C) with (Acvitation, Filter, Output). +In convolution codebase, DO NOT mix using (A, B, C) with (Activation, Filter, Output). -For example, a convolution class/function with A, B, Output is confusing and error-prone. Instead use below -mapping functions and adhere to using either A, B, C or Acvitation, Filter, Output. +For example, it's confusing and error prone to document a convolution class or function +as operating on "A, B, Output." Instead, use the mapping functions below, +and adhere to using either A, B, C or Activation, Filter, Output. 
Map elements' data types (ImplicitGemm -> Conv): GemmToConvElementMap Map elements' data types (Conv -> ImplicitGemm): ConvToGemmElementMap @@ -72,9 +73,10 @@ Map elements' data types (Conv -> ImplicitGemm): ConvToGemmElementMap #pragma once #include "cutlass/cutlass.h" +#include "cutlass/layout/tensor.h" #include "cutlass/tensor_coord.h" #include "cutlass/fast_math.h" -#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_enumerated_types.h" #include "cutlass/matrix_coord.h" namespace cutlass { @@ -83,40 +85,107 @@ namespace conv { //////////////////////////////////////////////////////////////////////////////////////////////////// /// Convolutional operator -enum class Operator { - kFprop, - kDgrad, - kWgrad +enum class Operator { + kFprop, + kDgrad, + kWgrad, + kDeconv }; -/// Distinguishes convolution from cross correlation -enum class Mode { - kCrossCorrelation, - kConvolution +/// Distinguishes convolution from cross correlation +enum class Mode { + kCrossCorrelation, + kConvolution }; /// Selects among several implementation variants trading off performance with simplicity -enum class IteratorAlgorithm { +enum class IteratorAlgorithm { kAnalytic, ///< functionally correct in all cases but lower performance kOptimized, ///< optimized for R <= 32, S <= 32 and unity-stride dgrad kFixedChannels, ///< Analytic algorithm optimized for fixed channel count (C == AccessSize) - kFewChannels ///< Analytic algorithm optimized for few channels (C divisible by AccessSize) + kFewChannels, ///< Analytic algorithm optimized for few channels (C divisible by AccessSize) + kFixedStrideDilation ///< Optimized for fixed stride and dilation }; /// Distinguishes among partial specializations that accelerate certain problems where convolution /// stride is unit. 
enum class StrideSupport { kStrided, ///< arbitrary convolution stride - kUnity ///< unit convolution stride + kUnity, ///< unit convolution stride + kFixed ///< fixed convolution stride }; /// Identifies split-K mode -enum class SplitKMode { - kNone, - kSerial, +enum class SplitKMode { + kNone, + kSerial, kParallel }; +/// Identifies group mode +enum class GroupMode { + kNone, + kSingleGroup, ///< One CTA calculates one group or less + kMultipleGroup, ///< One CTA calculates multiple groups + kDepthwise ///< One CTA calculates cta_n groups (problem_size.C == problem_size.K == problem_size.groups) +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Shape of a tensor +template < + int N = 1, + int H = 1, + int W = 1, + int C = 1 +> +struct TensorNHWCShape { + static int const kN = N; + static int const kH = H; + static int const kW = W; + static int const kC = C; + + static int const kHW = H * W; + static int const kNHW = N * kHW; + static int const kNHWC = N * H * W * C; + + static int const kCount = kNHWC; + + // + // Static member functions + // + + /// Returns a Coord object + CUTLASS_HOST_DEVICE + static Coord<4> toCoord() { + return make_Coord(kN, kH, kW, kC); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Shape of a conv2d stride, which controls how the filter convolves around the input volume +template < + /// Stride in horizontal direction + int u = 1, + /// Stride in vertical direction + int v = 1 +> +struct Stride2D { + static int const kU = u; + static int const kV = v; + + // + // Static member functions + // + + /// Returns a Coord object + CUTLASS_HOST_DEVICE + static Coord<2> toCoord() { + return make_Coord(kU, kV); + } +}; + //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace conv diff --git a/include/cutlass/conv/detail.hpp b/include/cutlass/conv/detail.hpp new file mode 100644 index 0000000000..3e4173569c --- /dev/null +++ b/include/cutlass/conv/detail.hpp @@ -0,0 +1,137 @@ + +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/conv/convnd_problem_shape.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// + + // Helper function to get the problem shape +template +auto get_problem_shape_MNKL_helper(ProblemShape const& problem_shape, cute::true_type) { + return T::get_problem_shape_MNKL(problem_shape); +} + +template +ProblemShape get_problem_shape_MNKL_helper(ProblemShape const& problem_shape, cute::false_type) { + return problem_shape; +} + +// Get problem shape MNKL according to following table: +// | | Fprop | Dgrad | Wgrad | +// | ---- | --------- | -------- | -------- | +// | Shape_M | (Q,P,Z,N) | (W/V,H/U,D/O,N) | (K) | +// | Shape_N | (K) | (C) | (C,S,R,T) | +// | Shape_K | (C,S,R,T) | (K,S,R,T) | (Q,P,Z,N) | +// | Shape_L | _1 | (V,U,O) | _1 | + +template +CUTLASS_HOST_DEVICE +constexpr auto +get_transformed_problem_shape_MNKL(ProblemShape const& problem_shape) { + return problem_shape; +} + + +template +CUTLASS_HOST_DEVICE +constexpr auto +get_transformed_problem_shape_MNKL(ConvProblemShape const& problem_shape) { + using cute::insert; + using cute::make_shape; + using cute::reverse; + using cute::take; + + constexpr int RankT = SpatialDim + 2; + + if constexpr (ConvOp == conv::Operator::kWgrad) { + auto M_xformed = problem_shape.shape_C[0]; + auto N_xformed = reverse(take<1, RankT>(problem_shape.shape_C)); + auto K_xformed = reverse(take<0, RankT - 1>(problem_shape.shape_A)); + auto L_xformed = cute::Int<1>{}; + + return make_shape(M_xformed, N_xformed, K_xformed, L_xformed); + } + else if constexpr (ConvOp == conv::Operator::kFprop){ + auto M_xformed = reverse(take<0, RankT - 1>(problem_shape.shape_C)); + auto N_xformed = problem_shape.shape_C[RankT - 1]; + auto K_xformed = reverse(take<1, RankT>(problem_shape.shape_B)); + auto L_xformed = cute::Int<1>{}; + + return make_shape(M_xformed, N_xformed, K_xformed, L_xformed); + } + else if constexpr (ConvOp == conv::Operator::kDgrad) { + auto L_xformed = reverse(problem_shape.traversal_stride); // (V,U,O) + auto M_xformed = ceil_div(reverse(take<0,RankT - 1>(problem_shape.shape_C)), L_xformed); + auto N_xformed = problem_shape.shape_C[RankT - 1]; + // shape_B: [K,T,R,S,C], K_xformed: [K,S,R,T] + auto K_xformed = insert<0>( + (reverse(take<1,RankT - 1>(problem_shape.shape_B))), + problem_shape.shape_B[0]); + + return make_shape(M_xformed, N_xformed, K_xformed, L_xformed); + } +} + +// Assuming im2col linearization +// Get problem shape MNKL according to following table: +// | | Fprop | Dgrad | Wgrad | +// | ---- | --------- | -------- | -------- | +// | Shape_M | (Q*P*Z*N) | ([W/V]*[H/U]*[D/O]*N) | (K) | +// | Shape_N | (K) | (C) | (C,S,R,T) | +// | Shape_K | (C,S,R,T) | (K,S,R,T) | (Q*P*Z*N) | +// | 
Shape_L | _1 | (V*U*O) | _1 | +template +CUTLASS_HOST_DEVICE +constexpr auto +get_linearized_problem_shape_MNKL(ConvProblemShape const& problem_shape) { + + auto [M, N, K, L] = get_transformed_problem_shape_MNKL(problem_shape); + + if constexpr (ConvOp == conv::Operator::kFprop || ConvOp == conv::Operator::kDgrad) { + return cute::make_shape(cute::product(M), N, K, cute::product(L)); + } + else if constexpr (ConvOp == conv::Operator::kWgrad) { + return cute::make_shape(M, N, cute::product(K), L); + } + +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::detail + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/device/conv_universal_adapter.hpp b/include/cutlass/conv/device/conv_universal_adapter.hpp new file mode 100644 index 0000000000..193f8d8854 --- /dev/null +++ b/include/cutlass/conv/device/conv_universal_adapter.hpp @@ -0,0 +1,421 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +// common +#include "cutlass/arch/mma.h" +#include "cutlass/cutlass.h" +#include "cutlass/arch/mma.h" +#include "cutlass/trace.h" +#include "cutlass/cluster_launch.hpp" +#include "cutlass/device_kernel.h" + +#include "cutlass/conv/kernel/conv_universal.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/detail/layout.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::device { + +//////////////////////////////////////////////////////////////////////////////// + +/*! 
+ ConvUniversalAdapter is a stateful, reusable handle built around a kernel + of type cutlass::conv::kernel::ConvUniversal. + + It manages the lifetime of the underlying `kernel::Params` struct, and exposes APIs + to create it from the host facing arguments. For power users, static methods + are exposed that bypass the stateful methods or args->params lowering. +*/ +template +class ConvUniversalAdapter +{ +public: + using ConvKernel = GetUnderlyingKernel_t; + using TileShape = typename ConvKernel::TileShape; + using ElementA = typename ConvKernel::ElementA; + using ElementB = typename ConvKernel::ElementB; + using ElementC = typename ConvKernel::ElementC; + using ElementD = typename ConvKernel::ElementD; + using ElementAccumulator = typename ConvKernel::TiledMma::ValTypeC; + using DispatchPolicy = typename ConvKernel::DispatchPolicy; + using CollectiveMainloop = typename ConvKernel::CollectiveMainloop; + using CollectiveEpilogue = typename ConvKernel::CollectiveEpilogue; + + static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER; + + // Tease out meta-information about the conv algorithm + static constexpr conv::Operator kConvolutionalOperator = DispatchPolicy::ConvOp; + static constexpr int NumSpatialDimensions = CollectiveMainloop::NumSpatialDimensions; + + // If our TiledMMA's instruction thread layout size is larger than 1, we know its a tensorop! + using OperatorClass = cute::conditional_t< + (cute::size(typename ConvKernel::TiledMma::AtomThrID{}) > 1), + cutlass::arch::OpClassTensorOp, cutlass::arch::OpClassSimt>; + + using ArchTag = typename ConvKernel::ArchTag; + + // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape + using ThreadblockShape = cutlass::gemm::GemmShape< + cute::size<0>(TileShape{}), + cute::size<1>(TileShape{}), + cute::size<2>(TileShape{})>; + + using ClusterShape = cutlass::gemm::GemmShape< + cute::size<0>(typename ConvKernel::DispatchPolicy::ClusterShape{}), + cute::size<1>(typename ConvKernel::DispatchPolicy::ClusterShape{}), + cute::size<2>(typename ConvKernel::DispatchPolicy::ClusterShape{})>; + + // Instruction shape is easy too, since we get that directly from our TiledMma's atom shape + using InstructionShape = cutlass::gemm::GemmShape< + cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}), + cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}), + cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>; + + // Legacy: provide a correct warp count, but no reliable warp shape + static int const kThreadCount = ConvKernel::MaxThreadsPerBlock; + + // Warp shape is not a primary API type in 3.x + // But we can best approximate it by inspecting the TiledMma + // For this, we make the assumption that we always have 4 warps along M, and rest along N, none along K + // We also always round up the warp count to 4 if the tiled mma is smaller than 128 threads + static constexpr int WarpsInMma = cute::max(4, CUTE_STATIC_V(cute::size(typename ConvKernel::TiledMma{})) / 32); + static constexpr int WarpsInMmaM = 4; + static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM); + using WarpCount = cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape< + CUTE_STATIC_V(cute::tile_size<0>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaM, + CUTE_STATIC_V(cute::tile_size<1>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaN, + CUTE_STATIC_V(cute::tile_size<2>(typename CollectiveMainloop::TiledMma{}))>; + + static int constexpr kStages = 
CollectiveMainloop::DispatchPolicy::Stages; + + // Inspect TiledCopy for A and B to compute the alignment size + static int constexpr kAlignmentA = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveMainloop::GmemTiledCopyA, ElementA>(); + static int constexpr kAlignmentB = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveMainloop::GmemTiledCopyB, ElementB>(); + static int constexpr kAlignmentC = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveEpilogue::GmemTiledCopyC, ElementC>(); + static int constexpr kAlignmentD = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveEpilogue::GmemTiledCopyD, ElementD>(); + + using EpilogueOutputOp = typename CollectiveEpilogue::ThreadEpilogueOp; + + /// Argument structure: User API + using Arguments = typename ConvKernel::Arguments; + /// Argument structure: Kernel API + using Params = typename ConvKernel::Params; + +private: + + /// Kernel API parameters object + Params params_; + +public: + + /// Access the Params structure + Params const& params() const { + return params_; + } + + /// Determines whether the conv can execute the given problem. + static Status + can_implement(Arguments const& args) { + if (ConvKernel::can_implement(args)) { + return Status::kSuccess; + } + else { + return Status::kInvalid; + } + } + + /// Gets the workspace size + static size_t + get_workspace_size(Arguments const& args) { + size_t workspace_bytes = 0; + CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes); + + workspace_bytes += ConvKernel::get_workspace_size(args); + return workspace_bytes; + } + + /// Computes the grid shape + static dim3 + get_grid_shape(Arguments const& args, void* workspace = nullptr) { + auto tmp_params = ConvKernel::to_underlying_arguments(args, workspace); + return ConvKernel::get_grid_shape(tmp_params); + } + + /// Computes the grid shape + static dim3 + get_grid_shape(Params const& params) { + return ConvKernel::get_grid_shape(params); + } + + /// Computes the maximum number of active blocks per multiprocessor + static int maximum_active_blocks(int /* smem_capacity */ = -1) { + CUTLASS_TRACE_HOST("ConvUniversal::maximum_active_blocks()"); + int max_active_blocks = -1; + int smem_size = ConvKernel::SharedStorageSize; + + // first, account for dynamic smem capacity if needed + cudaError_t result; + if (smem_size >= (48 << 10)) { + CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); + result = cudaFuncSetAttribute( + device_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST( + " cudaFuncSetAttribute() returned error: " + << cudaGetErrorString(result)); + return -1; + } + } + + // query occupancy after setting smem size + result = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, + device_kernel, + ConvKernel::MaxThreadsPerBlock, + smem_size); + + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST( + " cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: " + << cudaGetErrorString(result)); + return -1; + } + + CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks); + return max_active_blocks; + } + + /// Initializes conv state from arguments. 
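A rough host-side sketch of how this adapter is meant to be driven, shown as a free function template; `MyConvKernel` stands for a `cutlass::conv::kernel::ConvUniversal` instantiation composed elsewhere (for example with the collective builders), and the bare cudaMalloc/cudaFree workspace handling is illustrative only, not part of this header:

template <class MyConvKernel>
void launch_conv(typename MyConvKernel::Arguments const& args, cudaStream_t stream = nullptr) {
  using ConvOp = cutlass::conv::device::ConvUniversalAdapter<MyConvKernel>;
  ConvOp conv_op;

  // Reject problems the kernel cannot handle before allocating anything.
  if (ConvOp::can_implement(args) != cutlass::Status::kSuccess) {
    return;
  }

  // Allocate whatever scratch space the kernel requests for this problem.
  void* workspace = nullptr;
  cudaMalloc(&workspace, ConvOp::get_workspace_size(args));

  // initialize() lowers Arguments to Params and prepares the workspace; run() launches.
  if (conv_op.initialize(args, workspace, stream) == cutlass::Status::kSuccess) {
    conv_op.run(stream);
  }
  cudaFree(workspace);
}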
+ Status + initialize( + Arguments const& args, + void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr) { + + CUTLASS_TRACE_HOST("ConvUniversal::initialize() - workspace " + << workspace << ", stream: " << (stream ? "non-null" : "null")); + + // Initialize the workspace + Status status = ConvKernel::initialize_workspace(args, workspace, stream, cuda_adapter); + if (status != Status::kSuccess) { + return status; + } + + // Initialize the Params structure + params_ = ConvKernel::to_underlying_arguments(args, workspace); + + // Don't set the function attributes - require the CudaHostAdapter to set it. + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + return Status::kSuccess; + } + else { + // account for dynamic smem capacity if needed + int smem_size = ConvKernel::SharedStorageSize; + if (smem_size >= (48 << 10)) { + CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); + cudaError_t result = cudaFuncSetAttribute( + device_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result)); + return Status::kErrorInternal; + } + } + } + return Status::kSuccess; + } + + /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params. + Status + update(Arguments const& args, void* workspace = nullptr) { + CUTLASS_TRACE_HOST("ConvUniversal()::update() - workspace: " << workspace); + + size_t workspace_bytes = get_workspace_size(args); + if (workspace_bytes > 0 && nullptr == workspace) { + return Status::kErrorWorkspaceNull; + } + + params_ = ConvKernel::to_underlying_arguments(args, workspace); + return Status::kSuccess; + } + + /// Primary run() entry point API that is static allowing users to create and manage their own params. 
+ /// Supplied params struct must be constructed by calling ConvKernel::to_underlying_arguments() + static Status + run(Params& params, cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) { + CUTLASS_TRACE_HOST("ConvUniversal::run()"); + dim3 const block = ConvKernel::get_block_shape(); + dim3 const grid = get_grid_shape(params); + + // configure smem size and carveout + int smem_size = ConvKernel::SharedStorageSize; + + Status launch_result; + // Use extended launch API only for mainloops that use it + if constexpr (ConvKernel::ArchTag::kMinComputeCapability >= 90) { + [[maybe_unused]] constexpr bool is_static_1x1x1 = + cute::is_static_v<typename ConvKernel::DispatchPolicy::ClusterShape> and + cute::size(typename ConvKernel::DispatchPolicy::ClusterShape{}) == 1; + dim3 cluster(cute::size<0>(typename ConvKernel::DispatchPolicy::ClusterShape{}), + cute::size<1>(typename ConvKernel::DispatchPolicy::ClusterShape{}), + cute::size<2>(typename ConvKernel::DispatchPolicy::ClusterShape{})); + void* kernel_params[] = {&params}; + if constexpr (kEnableCudaHostAdapter) { + // + // Use the cuda host adapter + // + CUTLASS_ASSERT(cuda_adapter); + if (cuda_adapter) { + + launch_result = cuda_adapter->launch(grid, + cluster, + block, + smem_size, + stream, + kernel_params, + kernel_index); + } + else { + return Status::kErrorInternal; + } + } + else { + CUTLASS_ASSERT(cuda_adapter == nullptr); + void const* kernel = (void const*) device_kernel<ConvKernel>; + if constexpr (ConvKernel::ArchTag::kMinComputeCapability == 90) { + if constexpr (is_static_1x1x1) { + device_kernel<ConvKernel><<<grid, block, smem_size, stream>>>(params); + launch_result = Status::kSuccess; + } + else { + launch_result = ClusterLauncher::launch( + grid, cluster, block, smem_size, stream, kernel, kernel_params); + } + } + } + } + else { + launch_result = Status::kSuccess; + + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + if (cuda_adapter) { + void* kernel_params[] = {&params}; + + launch_result = cuda_adapter->launch( + grid, block, smem_size, stream, kernel_params, 0 + ); + + } + else { + return Status::kErrorInternal; + } + } + else { + CUTLASS_ASSERT(cuda_adapter == nullptr); + device_kernel<ConvKernel><<<grid, block, smem_size, stream>>>(params); + } + } + + cudaError_t result = cudaGetLastError(); + if (cudaSuccess == result && Status::kSuccess == launch_result) { + return Status::kSuccess; + } + else { + CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result); + return Status::kErrorInternal; + } + } + + // + // Non-static launch overloads that first create and set the internal params struct of this kernel handle. + // + + /// Launches the kernel after first constructing Params internal state from supplied arguments. + Status + run( + Arguments const& args, + void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr, + int32_t kernel_index = 0 + ) { + Status status = initialize(args, workspace, stream, cuda_adapter); + if (Status::kSuccess == status) { + status = run(params_, stream, cuda_adapter, kernel_index); + } + return status; + } + + /// Launches the kernel after first constructing Params internal state from supplied arguments. + Status + operator()( + Arguments const& args, + void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr) { + return run(args, workspace, stream, cuda_adapter); + } + + /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
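A companion sketch for the power-user path described above, where the caller lowers Arguments to Params once and reuses them across launches; `MyConvKernel`, `args`, `workspace`, and `stream` are placeholders, and workspace initialization is assumed to have been done separately:

// Lower the host-facing Arguments once, then launch through the static entry point
// as many times as needed, without holding an adapter instance.
auto params = MyConvKernel::to_underlying_arguments(args, workspace);

using Adapter = cutlass::conv::device::ConvUniversalAdapter<MyConvKernel>;
Adapter::run(params, stream);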
+ Status + run(cudaStream_t stream = nullptr) { + return run(params_, stream); + } + + /// Overload that allows a user to re-launch the same kernel without updating internal params struct. + Status + operator()(cudaStream_t stream = nullptr) { + return run(params_, stream); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::device + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/device/direct_convolution.h b/include/cutlass/conv/device/direct_convolution.h new file mode 100644 index 0000000000..43ab94b5fc --- /dev/null +++ b/include/cutlass/conv/device/direct_convolution.h @@ -0,0 +1,270 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* \file + \brief Template for device-level Depthwise Convolution +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/conv/convolution.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class DirectConvolution { +public: + + using UnderlyingKernel = DirectConvolutionKernel_; + + using ElementA = typename UnderlyingKernel::ElementA; + using LayoutA = typename UnderlyingKernel::LayoutA; + using ElementB = typename UnderlyingKernel::ElementB; + using LayoutB = typename UnderlyingKernel::LayoutB; + using ElementC = typename UnderlyingKernel::ElementC; + using LayoutC = typename UnderlyingKernel::LayoutC; + using ElementAccumulator = typename UnderlyingKernel::ElementAccumulator; + using ElementCompute = typename UnderlyingKernel::ElementCompute; + using OperatorClass = typename UnderlyingKernel::OperatorClass; + using ArchTag = typename UnderlyingKernel::ArchTag; + using ThreadblockShape = typename UnderlyingKernel::ThreadblockShape; + using WarpShape = typename UnderlyingKernel::WarpShape; + using InstructionShape = typename UnderlyingKernel::InstructionShape; + using ThreadblockSwizzle = typename UnderlyingKernel::ThreadblockSwizzle; + using EpilogueOutputOp = typename UnderlyingKernel::EpilogueOutputOp; + static int const kStages = UnderlyingKernel::kStages; + static int const kConvDim = UnderlyingKernel::kConvDim; + using WarpMmaOperator = typename UnderlyingKernel::WarpMmaOperator; + using ArchMmaOperator = typename UnderlyingKernel::ArchMmaOperator; + using MathOperator = typename UnderlyingKernel::MathOperator; + + static cutlass::conv::Operator const kConvolutionalOperator = UnderlyingKernel::kConvolutionalOperator; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = UnderlyingKernel::kIteratorAlgorithm; + static cutlass::conv::StrideSupport const kStrideSupport = UnderlyingKernel::kStrideSupport; + static cutlass::conv::GroupMode const kGroupMode = UnderlyingKernel::kGroupMode; + + static int const kWarpCount = + (ThreadblockShape::kM / WarpShape::kM) * + (ThreadblockShape::kN / WarpShape::kN) * + (ThreadblockShape::kK / WarpShape::kK); + + /// Argument structure + using Arguments = typename UnderlyingKernel::Arguments; + + using ReorderKernel = typename UnderlyingKernel::ReorderKernel; + + private: + + /// Kernel parameters object + typename UnderlyingKernel::Params params_; + +public: + + /// Constructs Implicit GEMM + DirectConvolution() { } + + /// Determines whether the Implicit GEMM can execute the given problem. 
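A minimal sketch of how this handle is typically used for a depthwise problem; `DepthwiseKernel` is a placeholder for a direct-convolution kernel type assembled elsewhere (for example via the depthwise default-kernel builders), and error handling is abbreviated:

template <class DepthwiseKernel>
void run_depthwise(typename DepthwiseKernel::Arguments const& args, cudaStream_t stream = nullptr) {
  using DeviceConv = cutlass::conv::device::DirectConvolution<DepthwiseKernel>;
  DeviceConv conv_op;

  // GroupMode::kDepthwise expects problem_size.C == problem_size.K == problem_size.groups.
  if (DeviceConv::can_implement(args) != cutlass::Status::kSuccess) {
    return;
  }

  // initialize() builds Params; run() launches the filter-reorder kernel first
  // (when a reordered-B buffer is supplied) and then the main depthwise kernel.
  if (conv_op.initialize(args, /*workspace=*/nullptr, stream) == cutlass::Status::kSuccess) {
    conv_op(stream);
  }
}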
+ static Status can_implement(Arguments const &args) { + + // dispatch to iterators + Status status = UnderlyingKernel::Mma::IteratorA::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + status = UnderlyingKernel::Mma::IteratorB::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + if (kGroupMode != conv::GroupMode::kDepthwise) { + return Status::kErrorInvalidProblem; + } + + // C and K should be multiple of groups + if (args.problem_size.K != args.problem_size.groups && + args.problem_size.C != args.problem_size.groups) { + return Status::kErrorInvalidProblem; + } + + + static int const kAlignmentC = UnderlyingKernel::Epilogue::OutputTileIterator::kElementsPerAccess; + if (kConvolutionalOperator == conv::Operator::kFprop) { + if (args.problem_size.K % kAlignmentC) + return Status::kErrorMisalignedOperand; + } else if (kConvolutionalOperator == conv::Operator::kDgrad) { + if (args.problem_size.C % kAlignmentC) + return Status::kErrorMisalignedOperand; + } else if (kConvolutionalOperator == conv::Operator::kWgrad) { + if (args.problem_size.C % kAlignmentC) + return Status::kErrorMisalignedOperand; + } + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape( + threadblock_swizzle.get_tiled_shape( + kConvolutionalOperator, + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices)); + + if (!(grid.y <= std::numeric_limits::max() && + grid.z <= std::numeric_limits::max())) { + + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + return 0; + } + + /// Initializes GEMM state from arguments. + Status initialize( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + // initialize the params structure from the arguments + params_ = typename UnderlyingKernel::Params( + args, + static_cast(workspace) + ); + + int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Initializes GEMM state from arguments. + Status update(Arguments const &args, void *workspace = nullptr) { + + // update the params structure from the arguments + params_.ptr_A = args.ref_A.data(); + params_.ptr_B = args.ref_B.data(); + params_.ptr_C = args.ref_C.data(); + params_.ptr_D = args.ref_D.data(); + params_.output_op = args.output_op; + params_.ptr_reordered_B = args.ref_reordered_B.data(); + params_.semaphore = static_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + + // Launch reorder kernel + if (params_.ptr_reordered_B != nullptr) { + dim3 grid = ReorderKernel::get_grid_shape(params_); + dim3 block = ReorderKernel::get_block_shape(); + + cutlass::arch::synclog_setup(); + cutlass::Kernel<<>>(params_); + } + + // Launch main kernel + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(32 * kWarpCount, 1, 1); + + // Dynamic SMEM size based on input params. 
+ int smem_size = int(params_.get_smem_size()); + + // Make sure we can use that much shared memory. + cudaError_t status = + cudaFuncSetAttribute(cutlass::Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + if (status != cudaSuccess) + return Status::kErrorInternal; + + cutlass::arch::synclog_setup(); + cutlass::Kernel<<>>(params_); + + cudaError_t result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } + + int get_smem_size() { return int(params_.get_smem_size()); } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} +} +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/device/implicit_gemm_convolution.h b/include/cutlass/conv/device/implicit_gemm_convolution.h index 8e87ec566a..a1cb06e98f 100644 --- a/include/cutlass/conv/device/implicit_gemm_convolution.h +++ b/include/cutlass/conv/device/implicit_gemm_convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -39,6 +39,7 @@ #include "cutlass/cutlass.h" #include "cutlass/device_kernel.h" #include "cutlass/conv/convolution.h" +#include "cutlass/cuda_host_adapter.hpp" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -52,32 +53,35 @@ template class ImplicitGemmConvolution { public: - using ImplicitGemmKernel = ImplicitGemmKernel_; - - using ElementA = typename ImplicitGemmKernel::ElementA; - using LayoutA = typename ImplicitGemmKernel::LayoutA; - using ElementB = typename ImplicitGemmKernel::ElementB; - using LayoutB = typename ImplicitGemmKernel::LayoutB; - using ElementC = typename ImplicitGemmKernel::ElementC; - using LayoutC = typename ImplicitGemmKernel::LayoutC; - using ElementAccumulator = typename ImplicitGemmKernel::ElementAccumulator; - using ElementCompute = typename ImplicitGemmKernel::ElementCompute; - using OperatorClass = typename ImplicitGemmKernel::OperatorClass; - using ArchTag = typename ImplicitGemmKernel::ArchTag; - using ThreadblockShape = typename ImplicitGemmKernel::ThreadblockShape; - using WarpShape = typename ImplicitGemmKernel::WarpShape; - using InstructionShape = typename ImplicitGemmKernel::InstructionShape; - using ThreadblockSwizzle = typename ImplicitGemmKernel::ThreadblockSwizzle; - using EpilogueOutputOp = typename ImplicitGemmKernel::EpilogueOutputOp; - static int const kStages = ImplicitGemmKernel::kStages; - static int const kConvDim = ImplicitGemmKernel::kConvDim; - using WarpMmaOperator = typename ImplicitGemmKernel::WarpMmaOperator; - using ArchMmaOperator = typename ImplicitGemmKernel::ArchMmaOperator; - using MathOperator = typename 
ImplicitGemmKernel::MathOperator; - - static cutlass::conv::Operator const kConvolutionalOperator = ImplicitGemmKernel::kConvolutionalOperator; - static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = ImplicitGemmKernel::kIteratorAlgorithm; - static cutlass::conv::StrideSupport const kStrideSupport = ImplicitGemmKernel::kStrideSupport; + using UnderlyingKernel = GetUnderlyingKernel_t; + + using ElementA = typename UnderlyingKernel::ElementA; + using LayoutA = typename UnderlyingKernel::LayoutA; + using ElementB = typename UnderlyingKernel::ElementB; + using LayoutB = typename UnderlyingKernel::LayoutB; + using ElementC = typename UnderlyingKernel::ElementC; + using LayoutC = typename UnderlyingKernel::LayoutC; + using ElementAccumulator = typename UnderlyingKernel::ElementAccumulator; + using ElementCompute = typename UnderlyingKernel::ElementCompute; + using OperatorClass = typename UnderlyingKernel::OperatorClass; + using ArchTag = typename UnderlyingKernel::ArchTag; + using ThreadblockShape = typename UnderlyingKernel::ThreadblockShape; + using WarpShape = typename UnderlyingKernel::WarpShape; + using InstructionShape = typename UnderlyingKernel::InstructionShape; + using ThreadblockSwizzle = typename UnderlyingKernel::ThreadblockSwizzle; + using EpilogueOutputOp = typename UnderlyingKernel::EpilogueOutputOp; + static int const kStages = UnderlyingKernel::kStages; + static int const kConvDim = UnderlyingKernel::kConvDim; + using WarpMmaOperator = typename UnderlyingKernel::WarpMmaOperator; + using ArchMmaOperator = typename UnderlyingKernel::ArchMmaOperator; + using MathOperator = typename UnderlyingKernel::MathOperator; + + static cutlass::conv::Operator const kConvolutionalOperator = UnderlyingKernel::kConvolutionalOperator; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = UnderlyingKernel::kIteratorAlgorithm; + static cutlass::conv::StrideSupport const kStrideSupport = UnderlyingKernel::kStrideSupport; + static cutlass::conv::GroupMode const kGroupMode = UnderlyingKernel::kGroupMode; + + static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER; static int const kWarpCount = (ThreadblockShape::kM / WarpShape::kM) * @@ -85,12 +89,12 @@ class ImplicitGemmConvolution { (ThreadblockShape::kK / WarpShape::kK); /// Argument structure - using Arguments = typename ImplicitGemmKernel::Arguments; + using Arguments = typename UnderlyingKernel::Arguments; private: /// Kernel parameters object - typename ImplicitGemmKernel::Params params_; + typename UnderlyingKernel::Params params_; public: @@ -99,23 +103,56 @@ class ImplicitGemmConvolution { /// Determines whether the Implicit GEMM can execute the given problem. 
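To make the group-mode checks added in the hunk below concrete, here is the same arithmetic with hypothetical numbers (all values are assumptions chosen purely for illustration):

// Hypothetical configuration: ThreadblockShape::kN == 64, K == 256, C == 128, groups == 4.
constexpr int kThreadblockN = 64;
constexpr int K = 256, C = 128, groups = 4;

// C and K must both be multiples of groups.
static_assert(K % groups == 0 && C % groups == 0, "C and K must be multiples of groups");

// 64 filters per group: one CTA tile stays inside a single group, so kSingleGroup applies.
constexpr int k_per_group = K / groups;
static_assert(k_per_group % kThreadblockN == 0,
              "kSingleGroup needs k_per_group to be a multiple of ThreadblockShape::kN");

// If k_per_group were 32 instead (smaller than the CTA tile), only kMultipleGroup could apply,
// which requires ThreadblockShape::kN % k_per_group == 0; the optimized iterator algorithm
// additionally supports only kSingleGroup.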
static Status can_implement(Arguments const &args) { - // dispatch to iterators - Status status = ImplicitGemmKernel::Mma::IteratorA::can_implement(args.problem_size); + Status status = UnderlyingKernel::Mma::IteratorA::can_implement(args.problem_size); if (Status::kSuccess != status) { return status; } - status = ImplicitGemmKernel::Mma::IteratorB::can_implement(args.problem_size); + status = UnderlyingKernel::Mma::IteratorB::can_implement(args.problem_size); if (Status::kSuccess != status) { return status; } - static int const kAlignmentC = ImplicitGemmKernel::Epilogue::OutputTileIterator::kElementsPerAccess; + // check group conv constraint + if (args.problem_size.groups != 1) { + if (kGroupMode == conv::GroupMode::kNone) { + return Status::kErrorInvalidProblem; + } + + // C and K should be multiple of groups + if (args.problem_size.K % args.problem_size.groups || + args.problem_size.C % args.problem_size.groups) { + return Status::kErrorInvalidProblem; + } + + // split-k is not supported + if (args.problem_size.split_k_slices != 1) { + return Status::kErrorInvalidProblem; + } + + int k_per_group = args.problem_size.K / args.problem_size.groups; + // k_per_group should be multiple of ThreadblockShape N, one CTA calculate one group + if (kGroupMode == conv::GroupMode::kSingleGroup && k_per_group % ThreadblockShape::kN) { + return Status::kErrorInvalidProblem; + } + // ThreadblockShape::kN should be divisible by k_per_group, one CTA calculate multiple groups + if (kGroupMode == conv::GroupMode::kMultipleGroup && ThreadblockShape::kN % k_per_group) { + return Status::kErrorInvalidProblem; + } + + // current optimized iterator algo only supports SingleGroup mode + if (kIteratorAlgorithm == IteratorAlgorithm::kOptimized && + kGroupMode != conv::GroupMode::kSingleGroup) { + return Status::kErrorInvalidProblem; + } + } + + static int const kAlignmentC = UnderlyingKernel::Epilogue::OutputTileIterator::kElementsPerAccess; if (kConvolutionalOperator == conv::Operator::kFprop) { if (args.problem_size.K % kAlignmentC) return Status::kErrorMisalignedOperand; - } else if (kConvolutionalOperator == conv::Operator::kDgrad) { + } else if (kConvolutionalOperator == conv::Operator::kDgrad || kConvolutionalOperator == conv::Operator::kDeconv) { if (args.problem_size.C % kAlignmentC) return Status::kErrorMisalignedOperand; } else if (kConvolutionalOperator == conv::Operator::kWgrad) { @@ -123,25 +160,15 @@ class ImplicitGemmConvolution { return Status::kErrorMisalignedOperand; } - // check for unsupported problem sizes for strided dgrad implementation - if (kConvolutionalOperator == conv::Operator::kDgrad && + // check for unsupported problem sizes for strided dgrad / deconv implementation + if ((kConvolutionalOperator == conv::Operator::kDgrad || kConvolutionalOperator == conv::Operator::kDeconv) && kStrideSupport == conv::StrideSupport::kStrided) { - - // Unity stride (1x1) is supported by strided dgrad but disabled for performance - // reasons. For unity stride, use strided dgrad optimized unity stride specialization. - // Note that unit tests strided dgrad for unity stride to make sure that strided - // dgrad implemetnation is functionaly sound. 
- // Strided dgrad implementation also support mixed strides, i.e., (1x2) and (2x1) - if(args.problem_size.stride_h == 1 && args.problem_size.stride_w == 1) { + // split-k (serial or parallel) is not supported for strided dgrad / deconv + if(args.problem_size.split_k_slices > 1 && (args.problem_size.stride().at(args.problem_size.stride().max_dim_index()) > 1)) { return Status::kErrorNotSupported; } - // split-k (serial or parallel) is not supported for strided dgrad - if(args.problem_size.split_k_slices > 1) { - return Status::kErrorNotSupported; - } - - // dilation > {1x1} is not supported for strided dgrad + // dilation > {1x1} is not supported for strided dgrad / deconv if(args.problem_size.dilation_h > 1 || args.problem_size.dilation_w > 1) { return Status::kErrorNotSupported; } @@ -204,7 +231,8 @@ class ImplicitGemmConvolution { Status initialize( Arguments const &args, void *workspace = nullptr, - cudaStream_t stream = nullptr) { + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr) { if (args.problem_size.split_k_slices > 1) { @@ -220,20 +248,26 @@ class ImplicitGemmConvolution { } // initialize the params structure from the arguments - params_ = typename ImplicitGemmKernel::Params( + params_ = typename UnderlyingKernel::Params( args, static_cast(workspace) ); - - int smem_size = int(sizeof(typename ImplicitGemmKernel::SharedStorage)); - if (smem_size >= (48 << 10)) { - cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size); - - if (result != cudaSuccess) { - return Status::kErrorInternal; + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + return Status::kSuccess; + } + else { + int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } } } @@ -255,7 +289,7 @@ class ImplicitGemmConvolution { } /// Runs the kernel using initialized state. - Status run(cudaStream_t stream = nullptr) { + Status run(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) { ThreadblockSwizzle threadblock_swizzle; @@ -263,30 +297,55 @@ class ImplicitGemmConvolution { dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); dim3 block(32 * kWarpCount, 1, 1); - int smem_size = int(sizeof(typename ImplicitGemmKernel::SharedStorage)); - - cutlass::Kernel<<>>(params_); + int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage)); + cutlass::Status launch_result = cutlass::Status::kSuccess ; + + if constexpr (kEnableCudaHostAdapter) { + // + // Use the cuda host adapter + // + CUTLASS_ASSERT(cuda_adapter); + if (cuda_adapter) { + + void* kernel_params[] = {¶ms_}; + launch_result = cuda_adapter->launch( + grid, dim3(1,1,1), block, smem_size, stream, kernel_params, kernel_index + ); + } + else { + launch_result = Status::kErrorInternal; + } + } + else { + cutlass::arch::synclog_setup(); + cutlass::Kernel<<>>(params_); + } cudaError_t result = cudaGetLastError(); - - return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + if (cudaSuccess == result && Status::kSuccess == launch_result) { + return Status::kSuccess; + } + else { + CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result); + return Status::kErrorInternal; + } } /// Runs the kernel using initialized state. 
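For contrast with the 3.x adapter earlier in this patch, a sketch of driving the updated 2.x device-level handle; `Conv2dFpropKernel` is a placeholder for a kernel built in the usual way (for example through `DefaultConv2dFprop`), and the Arguments object is assumed to have been filled from a `Conv2dProblemSize` plus tensor refs as before:

template <class Conv2dFpropKernel>
cutlass::Status run_fprop(
    typename cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>::Arguments const& arguments,
    cudaStream_t stream = nullptr) {
  using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
  ImplicitGemm implicit_gemm_op;

  // Workspace is only needed for split-K style runs; a zero-byte allocation is harmless.
  void* workspace = nullptr;
  cudaMalloc(&workspace, ImplicitGemm::get_workspace_size(arguments));

  // The new CudaHostAdapter / kernel_index parameters are optional and default to the
  // classic driver-launched path shown here.
  cutlass::Status status = implicit_gemm_op(arguments, workspace, stream);
  cudaFree(workspace);
  return status;
}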
- Status operator()(cudaStream_t stream = nullptr) { - return run(stream); + Status operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) { + return run(stream, cuda_adapter, kernel_index); } /// Runs the kernel using initialized state. Status operator()( Arguments const &args, void *workspace = nullptr, - cudaStream_t stream = nullptr) { + cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) { - Status status = initialize(args, workspace, stream); + Status status = initialize(args, workspace, stream, cuda_adapter); if (status == Status::kSuccess) { - status = run(stream); + status = run(stream, cuda_adapter, kernel_index); } return status; diff --git a/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h b/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h index b0c590add3..265156cc5b 100644 --- a/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h +++ b/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -231,6 +231,7 @@ class ImplicitGemmConvolutionFusion { int smem_size = int(sizeof(typename ImplicitGemmFusionKernel::SharedStorage)); + cutlass::arch::synclog_setup(); cutlass::Kernel<<>>(params_); cudaError_t result = cudaGetLastError(); diff --git a/include/cutlass/conv/dispatch_policy.hpp b/include/cutlass/conv/dispatch_policy.hpp new file mode 100644 index 0000000000..b8b5eb2bff --- /dev/null +++ b/include/cutlass/conv/dispatch_policy.hpp @@ -0,0 +1,90 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/conv/convolution.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/arch/arch.h" + +#include "cute/layout.hpp" +#include "cute/numeric/integral_constant.hpp" + +#include "cutlass/gemm/dispatch_policy.hpp" + +////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv { + +////////////////////////////////////////////////////////////////////////////// + +// +// Policies for categorical dispatch of mainloop against kernel grid schedules +// +struct KernelImplicitTmaWarpSpecializedSm90 : cutlass::gemm::KernelTmaWarpSpecialized { }; +struct KernelImplicitTmaWarpSpecializedSm90Cooperative { }; +struct KernelImplicitTmaWarpSpecializedSm90Pingpong { }; + +// +// Collective Mainloop Policies +// + +// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, static schedule between TMA and GMMA +// for fprop +template< + conv::Operator ConvOp_, + int Stages_, + int NumSpatialDimensions_, + class ClusterShape_ = cute::Shape<cute::C<1>,cute::C<1>,cute::C<1>>, + class KernelSchedule = KernelImplicitTmaWarpSpecializedSm90, + int PipelineAsyncMmaStages_ = 1 +> +struct MainloopSm90TmaGmmaWarpSpecializedImplicitGemm { + static constexpr int Stages = Stages_; + static constexpr int NumSpatialDimensions = NumSpatialDimensions_; + static constexpr Operator ConvOp = ConvOp_; + static constexpr int PipelineAsyncMmaStages = PipelineAsyncMmaStages_; + using ClusterShape = ClusterShape_; + using ArchTag = arch::Sm90; + using Schedule = KernelSchedule; + + static_assert(NumSpatialDimensions >= 1); + static_assert(! (cute::is_same_v<KernelSchedule, KernelImplicitTmaWarpSpecializedSm90Cooperative> || + cute::is_same_v<KernelSchedule, KernelImplicitTmaWarpSpecializedSm90Pingpong>), + "Persistent schedules not supported for conv yet."); +}; + +////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv + +////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/conv_universal.hpp b/include/cutlass/conv/kernel/conv_universal.hpp new file mode 100644 index 0000000000..23ccea2f8f --- /dev/null +++ b/include/cutlass/conv/kernel/conv_universal.hpp @@ -0,0 +1,65 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/conv/convnd_problem_shape.hpp" +#include "cutlass/detail/dependent_false.hpp" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::kernel { + +//////////////////////////////////////////////////////////////////////////////// + +/* + * Stateless universal device CONV kernel type that treats CONV as + * a composition of a collective mainloop and a collective epilogue. +**/ +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class TileSchedulerTag_ = void, + class Enable = void +> +class ConvUniversal { + static_assert(cutlass::detail::dependent_false, + "Could not find a valid specialization at the kernel layer to dispatch against."); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::kernel + +//////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp" +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/default_conv2d.h b/include/cutlass/conv/kernel/default_conv2d.h index 00cff2378c..79bedb2c84 100644 --- a/include/cutlass/conv/kernel/default_conv2d.h +++ b/include/cutlass/conv/kernel/default_conv2d.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -106,6 +106,56 @@ struct DefaultConvEpilogue< }; ///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename ArchTag, + typename Shape, + typename WarpMmaSimt, + typename ElementOutput, + typename ElementTensor, + typename ElementVector, + typename OutputOp, + int ElementsPerAccess, + typename PermuteDLayout = layout::NoPermute, + conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity, + int Rank = 4 +> +struct DefaultConvEpilogueWithBroadcastSimt { + using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastSimt< + Shape, + WarpMmaSimt, + ElementOutput, + ElementTensor, + ElementVector, + OutputOp, + ElementsPerAccess, + false, + PermuteDLayout, + StrideSupport, + Rank + >::Epilogue; +}; + +template < + typename ArchTag, + typename Shape, + typename WarpMmaSimt, + typename ElementOutput, + typename ElementTensor, + typename ElementVector, + typename OutputOp, + int ElementsPerAccess +> +struct DefaultConvEpilogueWithBroadcastSimtStridedDgrad { + using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastSimtStridedDgrad< + Shape, + WarpMmaSimt, + ElementOutput, + ElementTensor, + ElementVector, + OutputOp, + ElementsPerAccess + >::Epilogue; +}; template < typename ArchTag, diff --git a/include/cutlass/conv/kernel/default_conv2d_dgrad.h b/include/cutlass/conv/kernel/default_conv2d_dgrad.h index 547068713b..c5a8b1315e 100644 --- a/include/cutlass/conv/kernel/default_conv2d_dgrad.h +++ b/include/cutlass/conv/kernel/default_conv2d_dgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop.h b/include/cutlass/conv/kernel/default_conv2d_fprop.h index 859f23969a..9fbd97e585 100644 --- a/include/cutlass/conv/kernel/default_conv2d_fprop.h +++ b/include/cutlass/conv/kernel/default_conv2d_fprop.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -76,7 +76,7 @@ template < int Stages, typename MathOperatorTag, conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, - conv::StrideSupport StrideSupport = StrideSupport::kStrided, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, /// Access granularity of A matrix in units of elements int AlignmentA = 128 / cutlass::sizeof_bits::value, /// Access granularity of B matrix in units of elements @@ -327,7 +327,6 @@ struct DefaultConv2dFprop < >; }; - ///////////////////////////////////////////////////////////////////////////////////////////////// /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and two stage @@ -1167,7 +1166,11 @@ struct DefaultConv2dFprop < WarpMmaTensorOp, kPartitionsK, EpilogueOutputOp, - EpilogueOutputOp::kCount + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 4 >::Epilogue; // Define the kernel @@ -1628,7 +1631,11 @@ struct DefaultConv2dFprop < ThreadblockShape, WarpMmaSimtOp, EpilogueOutputOp, - EpilogueOutputOp::kCount + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 4 >::Epilogue; // Define the kernel @@ -1741,7 +1748,11 @@ struct DefaultConv2dFprop < ThreadblockShape, WarpMmaSimtOp, EpilogueOutputOp, - EpilogueOutputOp::kCount + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 4 >::Epilogue; // Define the kernel @@ -1751,7 +1762,6 @@ struct DefaultConv2dFprop < ThreadblockSwizzle, conv::Operator::kFprop >; - }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -1853,7 +1863,11 @@ struct DefaultConv2dFprop < ThreadblockShape, WarpMmaSimtOp, EpilogueOutputOp, - EpilogueOutputOp::kCount + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 4 >::Epilogue; // Define the kernel @@ -1967,7 +1981,11 @@ struct DefaultConv2dFprop < ThreadblockShape, WarpMmaSimtOp, EpilogueOutputOp, - EpilogueOutputOp::kCount + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 4 >::Epilogue; // Define the kernel diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h b/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h index 4fc2200a5d..8589ace029 100644 --- a/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h +++ b/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -45,8 +45,8 @@ #include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h" #include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h" #include "cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h" -#include "cutlass/conv/threadblock/regular_scale_bias_vector_access_iterator.h" -#include "cutlass/conv/warp/conv2d_fprop_scale_bias_iterator.h" +#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h" +#include "cutlass/gemm/warp/scale_bias_tile_iterator.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -76,7 +76,7 @@ template < int Stages, typename MathOperatorTag, conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, - conv::StrideSupport StrideSupport = StrideSupport::kStrided + conv::StrideSupport StrideSupport = StrideSupport::kUnity > struct DefaultConv2dFpropFusion; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -161,7 +161,7 @@ struct DefaultConv2dFpropFusion < LayoutScaleBias>; using SmemIteratorScaleBias = - cutlass::conv::threadblock::RegularScaleBiasVectorAccessIterator< + cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator< cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, LayoutScaleBias>; @@ -172,7 +172,7 @@ struct DefaultConv2dFpropFusion < static int const kThreadCount = 32; // Warp-level iterators to load scale and bias vectors - using WarpIteratorScaleBias = cutlass::conv::warp::WarpIteratorScaleBias< + using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator< MatrixShape, ElementScaleBias, LayoutScaleBias, MatrixShape, typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount, @@ -296,7 +296,7 @@ struct DefaultConv2dFpropFusion < LayoutScaleBias>; using SmemIteratorScaleBias = - cutlass::conv::threadblock::RegularScaleBiasVectorAccessIterator< + cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator< cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, LayoutScaleBias>; @@ -307,7 +307,7 @@ struct DefaultConv2dFpropFusion < static int const kThreadCount = 32; // Warp-level iterators to load scale and bias vectors - using WarpIteratorScaleBias = cutlass::conv::warp::WarpIteratorScaleBias< + using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator< MatrixShape, ElementScaleBias, LayoutScaleBias, MatrixShape, typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount, diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h b/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h new file mode 100644 index 0000000000..76bc12886c --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h @@ -0,0 +1,127 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Defines a default configuration for convolution with absolute maximum calculation. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_absmax.h" +#include "cutlass/epilogue/threadblock/epilogue_with_absmax.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> +struct DefaultConv2dFpropWithAbsMax { + + using ImplicitGemmBase = typename DefaultConv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithAbsMax< + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ImplicitGemmBase::Epilogue::kPartitionsK, + ElementC, + typename EpilogueOutputOp::ElementAuxOutput, + ElementC, + EpilogueOutputOp, + 
ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithAbsMax< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h b/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h index 187c46f93a..0825789ced 100644 --- a/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h +++ b/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -31,7 +31,7 @@ /*! \file \brief - Defines a GEMM with Reduction based on an existing UniversalGemm kernel. + Defines a GEMM with Broadcast based on an existing UniversalGemm kernel. */ @@ -71,7 +71,7 @@ template < int Stages, typename MathOperatorTag, conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, - conv::StrideSupport StrideSupport = StrideSupport::kStrided, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, /// Access granularity of A matrix in units of elements int AlignmentA = 128 / cutlass::sizeof_bits::value, /// Access granularity of B matrix in units of elements @@ -99,7 +99,7 @@ struct DefaultConv2dFpropWithBroadcast { AlignmentB >::Kernel; - // Replace epilogue + // Define epilogue using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp< ArchTag, typename ImplicitGemmBase::Epilogue::Shape, @@ -107,7 +107,98 @@ struct DefaultConv2dFpropWithBroadcast { ImplicitGemmBase::Epilogue::kPartitionsK, ElementC, typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dFpropWithBroadcast < + 
ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultConv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, EpilogueOutputOp, ImplicitGemmBase::Epilogue::kElementsPerAccess >::Epilogue; diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h b/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h index 82bbd252a1..e6e8a82209 100644 --- a/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h +++ b/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -72,7 +72,7 @@ template < int Stages, typename MathOperatorTag, conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, - conv::StrideSupport StrideSupport = StrideSupport::kStrided, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, /// Access granularity of A matrix in units of elements int AlignmentA = 128 / cutlass::sizeof_bits::value, /// Access granularity of B matrix in units of elements @@ -100,7 +100,7 @@ struct DefaultConv2dFpropWithReduction { AlignmentB >::Kernel; - // Replace epilogue + // Define epilogue using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithReductionTensorOp< ArchTag, typename ImplicitGemmBase::Epilogue::Shape, diff --git a/include/cutlass/conv/kernel/default_conv2d_group_fprop.h b/include/cutlass/conv/kernel/default_conv2d_group_fprop.h new file mode 100644 index 0000000000..e2deaf6fe2 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv2d_group_fprop.h @@ -0,0 +1,622 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dGroupFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::GroupMode GroupMode, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> struct DefaultConv2dGroupFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dGroupFprop 
specialization for Analytic IteratorAlgorithm and multistage +/// pipeline that supports all GroupMode. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::GroupMode GroupMode, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dGroupFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + GroupMode, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA, + AccessTypeA, + GroupMode + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + AccessTypeB, + GroupMode + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? 
cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + GroupMode + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dGroupFprop specialization for Analytic IteratorAlgorithm and +/// 2 stage pipeline that supports all GroupMode. + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::GroupMode GroupMode, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dGroupFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + GroupMode, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA, + AccessTypeA, + GroupMode + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + AccessTypeB, + GroupMode + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = 
threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + GroupMode + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dGroupFprop specialization for Optimized IteratorAlgorithm and multistage +/// pipeline that supports GroupMode::kSingleGroup. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dGroupFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + GroupMode::kSingleGroup, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA, + AccessTypeA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + AccessTypeB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? 
cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + GroupMode::kSingleGroup + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dGroupFprop specialization for Optimized IteratorAlgorithm and +/// 2 stage pipeline that supports GroupMode::kSingleGroup. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dGroupFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + GroupMode::kSingleGroup, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA, + AccessTypeA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB, + AccessTypeB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = 
threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + GroupMode::kSingleGroup + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/default_conv2d_wgrad.h b/include/cutlass/conv/kernel/default_conv2d_wgrad.h index c5e5b3c5b5..d0e52dfe34 100644 --- a/include/cutlass/conv/kernel/default_conv2d_wgrad.h +++ b/include/cutlass/conv/kernel/default_conv2d_wgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h b/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h index e43e02cde5..110e07db9c 100644 --- a/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h +++ b/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/conv/kernel/default_conv3d_dgrad.h b/include/cutlass/conv/kernel/default_conv3d_dgrad.h index 8f408ef04e..cb50ba49b8 100644 --- a/include/cutlass/conv/kernel/default_conv3d_dgrad.h +++ b/include/cutlass/conv/kernel/default_conv3d_dgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -293,6 +293,439 @@ struct DefaultConv3dDgrad < }; +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dDgrad specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + 
LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + // ThreadMapB, + // StrideSupport::kUnity + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + // cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + // > + >; + + using 
SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + // cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + // > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dDgrad specialization for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + // cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + // > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + // cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + // ThreadMapB, + // StrideSupport::kUnity + // > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // 
Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; + +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernel diff --git a/include/cutlass/conv/kernel/default_conv3d_fprop.h b/include/cutlass/conv/kernel/default_conv3d_fprop.h index 7b20ae344a..41fdd64a5e 100644 --- a/include/cutlass/conv/kernel/default_conv3d_fprop.h +++ b/include/cutlass/conv/kernel/default_conv3d_fprop.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -54,7 +54,7 @@ namespace conv { namespace kernel { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Defines a kernel for Conv2dFprop +/// Defines a kernel for Conv3dFprop template < typename ElementA, typename LayoutA, @@ -73,7 +73,7 @@ template < int Stages, typename MathOperatorTag, conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, - conv::StrideSupport StrideSupport = StrideSupport::kStrided + conv::StrideSupport StrideSupport = StrideSupport::kUnity > struct DefaultConv3dFprop; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -94,7 +94,8 @@ template < typename InstructionShape, typename EpilogueOutputOp, typename ThreadblockSwizzle, - typename MathOperatorTag + typename MathOperatorTag, + conv::StrideSupport StrideSupport > struct DefaultConv3dFprop < ElementA, @@ -113,7 +114,8 @@ struct DefaultConv3dFprop < ThreadblockSwizzle, 2, MathOperatorTag, - IteratorAlgorithm::kAnalytic + IteratorAlgorithm::kAnalytic, + StrideSupport > { // Define the core components from GEMM @@ -185,7 +187,7 @@ struct DefaultConv3dFprop < ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage +/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm and multistage // pipeline. 
template < typename ElementA, @@ -202,7 +204,8 @@ template < typename EpilogueOutputOp, typename ThreadblockSwizzle, int Stages, - typename MathOperatorTag + typename MathOperatorTag, + conv::StrideSupport StrideSupport > struct DefaultConv3dFprop < ElementA, @@ -221,7 +224,8 @@ struct DefaultConv3dFprop < ThreadblockSwizzle, Stages, MathOperatorTag, - IteratorAlgorithm::kAnalytic + IteratorAlgorithm::kAnalytic, + StrideSupport > { // Define the core components from GEMM @@ -306,7 +310,8 @@ template < typename InstructionShape, typename EpilogueOutputOp, typename ThreadblockSwizzle, - typename MathOperatorTag + typename MathOperatorTag, + conv::StrideSupport StrideSupport > struct DefaultConv3dFprop < ElementA, @@ -325,7 +330,8 @@ struct DefaultConv3dFprop < ThreadblockSwizzle, 2, MathOperatorTag, - IteratorAlgorithm::kOptimized + IteratorAlgorithm::kOptimized, + StrideSupport > { // Define the core components from GEMM @@ -416,7 +422,8 @@ template < typename EpilogueOutputOp, typename ThreadblockSwizzle, int Stages, - typename MathOperatorTag + typename MathOperatorTag, + conv::StrideSupport StrideSupport > struct DefaultConv3dFprop < ElementA, @@ -435,7 +442,8 @@ struct DefaultConv3dFprop < ThreadblockSwizzle, Stages, MathOperatorTag, - IteratorAlgorithm::kOptimized + IteratorAlgorithm::kOptimized, + StrideSupport > { // Define the core components from GEMM @@ -492,7 +500,465 @@ struct DefaultConv3dFprop < WarpMmaTensorOp, 1, EpilogueOutputOp, - EpilogueOutputOp::kCount + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over 
tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename 
epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialization for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename 
ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 5 >::Epilogue; // Define the kernel diff --git a/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h b/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h new file mode 100644 index 0000000000..d0457d572e --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h @@ -0,0 +1,360 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level fused activation's scale+bias+relu and implicit GEMM convolution + definitions that combine threadblock-scoped matrix multiply-add with the + appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h" +#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h" +#include "cutlass/gemm/warp/scale_bias_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for fused batch norm and Conv3dFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity +> struct DefaultConv3dFpropFusion; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialzation for Analytic IteratorAlgorithm and multistage +/// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dFpropFusion < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementScaleBias, + LayoutScaleBias, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + /// Define iterators over tiles from scale/bias vectors + using IteratorScaleBias = + cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, + LayoutScaleBias>; + + using SmemIteratorScaleBias = + cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, + LayoutScaleBias>; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static int const kThreadCount = 32; + + // Warp-level iterators to load scale and bias vectors + using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator< + MatrixShape, ElementScaleBias, + LayoutScaleBias, MatrixShape, + typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount, + MmaCore::WarpCount::kK>; + + // Define the Mma + using Mma = threadblock::ImplicitGemmFpropFusionMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + IteratorScaleBias, + SmemIteratorScaleBias, + arch::CacheOperation::Always, + MmaPolicy, + WarpIteratorScaleBias, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for 
Conv3dFprop specialzation for Optimzed IteratorAlgorithm and +/// multistage pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dFpropFusion < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementScaleBias, + LayoutScaleBias, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag + >; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + /// Define iterators over tiles from scale/bias vectors + using IteratorScaleBias = + cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, + LayoutScaleBias>; + + using SmemIteratorScaleBias = + cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, + LayoutScaleBias>; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static int const kThreadCount = 32; + + // Warp-level iterators to load scale and bias vectors + using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator< + MatrixShape, ElementScaleBias, + LayoutScaleBias, MatrixShape, + typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount, + MmaCore::WarpCount::kK>; + + // Define the Mma + using Mma = threadblock::ImplicitGemmFpropFusionMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + IteratorScaleBias, + SmemIteratorScaleBias, + arch::CacheOperation::Always, + MmaPolicy, + WarpIteratorScaleBias, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + 
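The fused scale+bias+ReLU Conv3dFprop defaults above follow the usual CUTLASS 2.x pattern: `DefaultConv3dFpropFusion` stitches together the mainloop, the scale/bias iterators, and the epilogue, and publishes the assembled type as `::Kernel`. As a rough illustration only, the sketch below shows one plausible way such a kernel could be composed and handed to the device-level fusion runner; every concrete choice here (half-precision NDHWC tensors, SM80 tensor-op tile shapes, a plain linear-combination epilogue, three stages, the `ImplicitGemmConvolutionFusion` wrapper) is an assumption made for the example, not something prescribed by this header.

#include "cutlass/conv/kernel/default_conv3d_fprop_fusion.h"
#include "cutlass/conv/device/implicit_gemm_convolution_fusion.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"

// Hypothetical composition of the fused Conv3dFprop kernel (illustrative types and tile shapes).
using FusedConv3dFpropKernel = cutlass::conv::kernel::DefaultConv3dFpropFusion<
  cutlass::half_t, cutlass::layout::TensorNDHWC,    // ElementA / LayoutA (activations)
  cutlass::half_t, cutlass::layout::TensorNDHWC,    // ElementB / LayoutB (filters)
  cutlass::half_t, cutlass::layout::RowMajor,       // ElementScaleBias / LayoutScaleBias
  cutlass::half_t, cutlass::layout::TensorNDHWC,    // ElementC / LayoutC (output)
  float,                                            // ElementAccumulator
  cutlass::arch::OpClassTensorOp,
  cutlass::arch::Sm80,
  cutlass::gemm::GemmShape<128, 128, 32>,           // ThreadblockShape
  cutlass::gemm::GemmShape<64, 64, 32>,             // WarpShape
  cutlass::gemm::GemmShape<16, 8, 16>,              // InstructionShape
  cutlass::epilogue::thread::LinearCombination<
      cutlass::half_t, 8, float, float>,            // EpilogueOutputOp
  cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
  3,                                                // Stages
  cutlass::arch::OpMultiplyAdd,                     // MathOperatorTag
  cutlass::conv::IteratorAlgorithm::kOptimized      // selects the optimized specialization above
>::Kernel;

// Assumed device-level wrapper for the fused kernel, mirroring how the 2-D fusion kernels are run.
using FusedConv3dFprop =
    cutlass::conv::device::ImplicitGemmConvolutionFusion<FusedConv3dFpropKernel>;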
+///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h b/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h new file mode 100644 index 0000000000..0fc291e605 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h @@ -0,0 +1,222 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Defines a GEMM with Broadcast based on an existing UniversalGemm kernel. 
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_fprop.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h" +#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> +struct DefaultConv3dFpropWithBroadcast { + + using ImplicitGemmBase = typename DefaultConv3dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ImplicitGemmBase::Epilogue::kPartitionsK, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv3dFpropWithBroadcast < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + 
ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultConv3dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess, + layout::NoPermute, + StrideSupport, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/default_conv3d_wgrad.h b/include/cutlass/conv/kernel/default_conv3d_wgrad.h index 46dc392043..4ed5e0c1bf 100644 --- a/include/cutlass/conv/kernel/default_conv3d_wgrad.h +++ b/include/cutlass/conv/kernel/default_conv3d_wgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -53,7 +53,7 @@ namespace kernel { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Defines a kernel for Conv2dWgrad +/// Defines a kernel for Conv3dWgrad template < typename ElementA, typename LayoutA, @@ -500,6 +500,433 @@ struct DefaultConv3dWgrad < Conv3dProblemSize >; }; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename 
ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, 
MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy 
= typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; + +}; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernel diff --git a/include/cutlass/conv/kernel/default_deconv2d.h b/include/cutlass/conv/kernel/default_deconv2d.h new file mode 100644 index 0000000000..4db152cd7a --- /dev/null +++ b/include/cutlass/conv/kernel/default_deconv2d.h @@ -0,0 +1,999 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Deconv2d +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> struct DefaultDeconv2d; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Deconv2d specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + 
using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + conv::GroupMode::kNone, + true /*IsDeconv*/ + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + conv::GroupMode::kNone, + true /*IsDeconv*/ + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + 
ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Deconv2d specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + true /*IsDeconv*/ + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename 
ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + true /*IsDeconv*/ + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Deconv2d specialization for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, 
arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + conv::GroupMode::kNone, + true /*IsDeconv*/ + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + 
ThreadMapB, + cutlass::AlignedArray, + conv::GroupMode::kNone, + true /*IsDeconv*/ + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Deconv2d specialization for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + true /*IsDeconv*/ + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + 
StrideSupport::kStrided, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + true /*IsDeconv*/ + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h b/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h new file mode 100644 index 0000000000..d11432ed39 --- /dev/null +++ b/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h @@ -0,0 +1,305 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Defines a GEMM with Broadcast based on an existing UniversalGemm kernel. 
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_deconv2d.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h" +#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> +struct DefaultDeconv2dWithBroadcast { + + using ImplicitGemmBase = typename DefaultDeconv2d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ImplicitGemmBase::Epilogue::kPartitionsK, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Deconv2d specialization, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2dWithBroadcast < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + 
conv::StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultDeconv2d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kUnity, + AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2dWithBroadcast < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultDeconv2d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimtStridedDgrad< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/default_deconv3d.h b/include/cutlass/conv/kernel/default_deconv3d.h new file mode 100644 index 0000000000..70800c7af7 --- /dev/null +++ b/include/cutlass/conv/kernel/default_deconv3d.h @@ -0,0 +1,541 @@ 
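Note: the `Default*WithBroadcast` traits above (and the 3-D variants later in this patch) all follow the same composition recipe: instantiate the plain default (`DefaultDeconv2d` / `DefaultDeconv3d`) to obtain `ImplicitGemmBase`, keep its mainloop untouched, substitute a broadcast-capable epilogue derived from the base epilogue's compile-time configuration, and re-assemble the kernel. The following is a minimal, self-contained sketch of that recipe only; `BaseDefault`, `BroadcastEpilogue` and `KernelFromParts` are illustrative stand-ins, not CUTLASS types.

```cpp
#include <type_traits>

// Pretend "base default": picks a mainloop and a plain epilogue for an element type.
template <typename Element>
struct BaseDefault {
  struct Mma      { using ElementT = Element; };
  struct Epilogue { using ElementT = Element; static constexpr int kElementsPerAccess = 4; };
  struct Kernel   { using MmaT = Mma; using EpilogueT = Epilogue; };
};

// Broadcast epilogue derived from the base epilogue's compile-time configuration.
template <typename BaseEpilogue, typename ElementVector>
struct BroadcastEpilogue {
  static constexpr int kElementsPerAccess = BaseEpilogue::kElementsPerAccess;
  using ElementVectorT = ElementVector;
};

// Re-assembled kernel: base mainloop + replacement epilogue.
template <typename Mma, typename Epilogue>
struct KernelFromParts { using MmaT = Mma; using EpilogueT = Epilogue; };

// Mirrors the shape of DefaultDeconv2dWithBroadcast: derive the base, swap the epilogue.
template <typename Element, typename ElementVector>
struct DefaultWithBroadcast {
  using Base     = BaseDefault<Element>;
  using Epilogue = BroadcastEpilogue<typename Base::Epilogue, ElementVector>;
  using Kernel   = KernelFromParts<typename Base::Mma, Epilogue>;
};

int main() {
  using K = DefaultWithBroadcast<float, float>::Kernel;
  static_assert(std::is_same<K::MmaT, BaseDefault<float>::Mma>::value,
                "the mainloop is reused unchanged from the base default");
  return 0;
}
```

The point of the pattern is that the broadcast variant never re-derives tile shapes or iterators; it only rewires the epilogue.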
+/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. 
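For orientation, the deconv defaults in this header treat the operation as a dgrad-style implicit GEMM: the output-gradient and filter tensors are presented to the mainloop as the A and B operands of a GEMM whose M/N/K extents are derived from the convolution problem size. The sketch below shows the commonly cited 2-D mapping (GEMM_M = N*H*W, GEMM_N = C, GEMM_K = K*R*S); the 3-D case additionally folds the depth extent into M and the filter depth into K. Treat this as an illustration of the mapping, not a quotation of `cutlass::conv::implicit_gemm_problem_size`.

```cpp
#include <cstdio>

struct Conv2dSize { int N, H, W, C, K, R, S; };
struct GemmSize   { long long m, n, k; };

// Dgrad/deconv-style implicit GEMM extents (2-D case).
GemmSize dgrad_implicit_gemm(Conv2dSize const& c) {
  return { (long long)c.N * c.H * c.W,    // one GEMM row per input-gradient element
           (long long)c.C,                // one GEMM column per input channel
           (long long)c.K * c.R * c.S };  // reduction over output channels x filter taps
}

int main() {
  Conv2dSize c{/*N=*/8, /*H=*/56, /*W=*/56, /*C=*/64, /*K=*/128, /*R=*/3, /*S=*/3};
  GemmSize g = dgrad_implicit_gemm(c);
  std::printf("implicit GEMM extents: %lld x %lld x %lld\n", g.m, g.n, g.k);
  return 0;
}
```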
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h" + +#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Deconv3d +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultDeconv3d; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultDeconv3d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + true /*IsDeconv*/ + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = 
threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Deconv3d specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultDeconv3d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB, + true /*IsDeconv*/ + // ThreadMapB, + // StrideSupport::kUnity + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; +}; + 
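The `DefaultDeconv3d` specializations above are selected entirely at compile time by the `IteratorAlgorithm`, `StrideSupport` and stage-count arguments: analytic iterators pair with the strided path, optimized iterators with the unit-stride fast path, and combinations that are not specialized simply do not exist. Below is a stripped-down sketch of that dispatch style using stand-in names rather than the CUTLASS enums and traits.

```cpp
#include <cstdio>

enum class IteratorAlgorithm { kAnalytic, kOptimized };
enum class StrideSupport     { kStrided, kUnity };

// Primary template: declared but not defined, so unsupported combinations cannot compile.
template <IteratorAlgorithm Algo, StrideSupport Stride>
struct DefaultKernel;

template <>
struct DefaultKernel<IteratorAlgorithm::kAnalytic, StrideSupport::kStrided> {
  static const char* name() { return "analytic iterators, strided mapping"; }
};

template <>
struct DefaultKernel<IteratorAlgorithm::kOptimized, StrideSupport::kUnity> {
  static const char* name() { return "optimized iterators, unit-stride fast path"; }
};

int main() {
  std::printf("%s\n", DefaultKernel<IteratorAlgorithm::kAnalytic, StrideSupport::kStrided>::name());
  std::printf("%s\n", DefaultKernel<IteratorAlgorithm::kOptimized, StrideSupport::kUnity>::name());
  // DefaultKernel<IteratorAlgorithm::kOptimized, StrideSupport::kStrided> is not defined
  // here, so naming its members would be a compile-time error, just as the real defaults
  // only exist for supported (algorithm, stride support) combinations.
  return 0;
}
```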
+///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultDeconv3d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + // cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + // > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + // cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + true /*IsDeconv*/ + // > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Deconv3d specialization for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultDeconv3d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, 
+ 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + // cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + // > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + // cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB, + true /*IsDeconv*/ + // ThreadMapB, + // StrideSupport::kUnity + // > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h b/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h new file mode 100644 index 0000000000..affe7a06f4 --- /dev/null +++ b/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h @@ -0,0 +1,309 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Defines a GEMM with Broadcast based on an existing UniversalGemm kernel. + +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_deconv3d.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h" +#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> +struct DefaultDeconv3dWithBroadcast { + + using ImplicitGemmBase = typename DefaultDeconv3d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ImplicitGemmBase::Epilogue::kPartitionsK, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt 
convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Deconv3d specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv3dWithBroadcast < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultDeconv3d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kUnity + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv3dWithBroadcast < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultDeconv3d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kStrided + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt< + ArchTag, + 
typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/default_depthwise_fprop.h b/include/cutlass/conv/kernel/default_depthwise_fprop.h new file mode 100644 index 0000000000..aa4f2c359c --- /dev/null +++ b/include/cutlass/conv/kernel/default_depthwise_fprop.h @@ -0,0 +1,588 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level Depthwise implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. 
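The depthwise fprop defaults introduced in this header set `GroupMode::kDepthwise`, i.e. groups equals channels, so each output channel is produced from exactly one input channel and there is no cross-channel reduction. As a reference for that semantic only (it says nothing about the tiling or iterators defined here), the following is a plain CPU implementation, assuming NHWC activations and an R x S x C filter layout for illustration:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

void depthwise_conv2d_nhwc(const std::vector<float>& x, int H, int W, int C,
                           const std::vector<float>& w, int R, int S,
                           std::vector<float>& y, int stride, int pad) {
  int P = (H + 2 * pad - R) / stride + 1;
  int Q = (W + 2 * pad - S) / stride + 1;
  y.assign(std::size_t(P) * Q * C, 0.f);
  for (int p = 0; p < P; ++p)
    for (int q = 0; q < Q; ++q)
      for (int c = 0; c < C; ++c) {          // one filter per channel; no cross-channel sum
        float acc = 0.f;
        for (int r = 0; r < R; ++r)
          for (int s = 0; s < S; ++s) {
            int h = p * stride - pad + r;
            int wcol = q * stride - pad + s;
            if (h >= 0 && h < H && wcol >= 0 && wcol < W)
              acc += x[(std::size_t(h) * W + wcol) * C + c] * w[(std::size_t(r) * S + s) * C + c];
          }
        y[(std::size_t(p) * Q + q) * C + c] = acc;
      }
}

int main() {
  std::vector<float> x(5 * 5 * 2, 1.f), w(3 * 3 * 2, 1.f), y;
  depthwise_conv2d_nhwc(x, 5, 5, 2, w, 3, 3, y, /*stride=*/1, /*pad=*/1);
  std::printf("y[0] = %.0f\n", y[0]);  // corner output: only 2x2 taps are in bounds -> 4
  return 0;
}
```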
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" +#include "cutlass/conv/kernel/direct_convolution.h" + +#include "cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/depthwise_fprop_pipelined.h" + +// Direct Conv Related Header files +#include "cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h" +#include "cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h" + +#include "cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h" +#include "cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for DepthwiseFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = cutlass::sizeof_bits::value / cutlass::sizeof_bits::value +> struct DefaultDepthwiseFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for DepthwiseFprop with direct convolution algorithm +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename ThreadBlockOutputShape, + typename FilterShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + // MatrixShape + typename StrideShape = cutlass::MatrixShape<-1, -1>, + // MatrixShape< Height, Width> + typename DilationShape = cutlass::MatrixShape<-1, -1>, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> struct DefaultDepthwiseDirect2dConvFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Depthwise specialization for Analytic 
IteratorAlgorithm +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultDepthwiseFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, // cutlass::arch::OpMultiplyAdd + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::conv::threadblock::DepthwiseMmaCoreWithLaneAccessSize< + ThreadblockShape, + WarpShape, + InstructionShape, + ElementA, + layout::RowMajor, + ElementB, + layout::ColumnMajor, + ElementAccumulator, + layout::RowMajor, + arch::OpClassSimt, + 128, + sizeof_bits::value, + 2, + MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + AccessTypeB, + cutlass::conv::GroupMode::kDepthwise + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::DepthwiseFpropPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + cutlass::conv::GroupMode::kDepthwise + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Depthwise specialization for direct 2d conv implementation, +/// multiple stage pipeline, and SIMT-based mainloop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename ThreadBlockOutputShape, + typename FilterShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + typename StrideShape, + 
typename DilationShape, + int AlignmentA, + int AlignmentB +> +struct DefaultDepthwiseDirect2dConvFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + ThreadBlockOutputShape, + FilterShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + StrideShape, + DilationShape, + AlignmentA, + AlignmentB +> { + // One warp handles the entrie groups per cta. + static_assert(ThreadblockShape::kN == WarpShape::kN, + "ThreadblockShape::kN should be same as WarpShape::kN "); + static_assert(ThreadblockShape::kK == FilterShape::kCount && WarpShape::kK == FilterShape::kCount, + "ThreadblockShape::kK and WarpShape::kK should be same as filter size"); + static_assert(ThreadblockShape::kM % WarpShape::kM == 0, + "ThreadblockShape::kM must be divisible by WarpShape shape::kM"); + static_assert(ThreadBlockOutputShape::kN, "ThreadBlockOutputShape::kN should be 1"); + + // Define the core components from GEMM + using MmaCore = typename cutlass::conv::threadblock::DepthwiseDirectConvMmaCoreWithLaneAccessSize< + ThreadblockShape, + ThreadBlockOutputShape, + FilterShape, + WarpShape, + InstructionShape, + ElementA, + layout::RowMajor, + ElementB, + layout::ColumnMajor, + ElementAccumulator, + layout::RowMajor, + arch::OpClassSimt, + 128, + 128, + Stages, + MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized< + cutlass::MatrixShape, // < outputShape:KMNK, groups per cta> + ThreadBlockOutputShape, + ElementA, LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + using ThreadOutputShape = typename MmaCore::ThreadOutputShape; + static cutlass::arch::CacheOperation::Kind const CacheOpA = + ((sizeof_bits::value * AlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? 
cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultDirectConvEpilogueSimt< + ThreadblockShape, // < outputShape:KMNK, groups per cta> + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + ThreadOutputShape, + ThreadBlockOutputShape + >::Epilogue; + + // Define the Mma + using Mma = threadblock::DepthwiseFpropDirectConvMultipleStage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + CacheOpA, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages, + Epilogue + >; + + // Define the kernel + using Kernel = cutlass::conv::kernel::DirectConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + cutlass::conv::GroupMode::kDepthwise, + ThreadBlockOutputShape + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Depthwise specialization for direct 2d conv implementation, +/// multiple stage pipeline, and SIMT-based mainloop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename ThreadBlockOutputShape, + typename FilterShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + typename StrideShape, + typename DilationShape, + int AlignmentA, + int AlignmentB +> +struct DefaultDepthwiseDirect2dConvFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + ThreadBlockOutputShape, + FilterShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kFixedStrideDilation, + StrideSupport, + StrideShape, + DilationShape, + AlignmentA, + AlignmentB +> { + + + + // One warp handles the entrie groups per cta. 
+ static_assert(ThreadblockShape::kN == WarpShape::kN, + "ThreadblockShape::kN should be same as WarpShape::kN "); + static_assert(ThreadblockShape::kK == FilterShape::kCount && WarpShape::kK == FilterShape::kCount, + "ThreadblockShape::kK and WarpShape::kK should be same as filter size"); + static_assert(ThreadblockShape::kM % WarpShape::kM == 0, + "ThreadblockShape::kM must be divisible by WarpShape shape::kM"); + static_assert(ThreadBlockOutputShape::kN, "ThreadBlockOutputShape::kN should be 1"); + + static_assert(StrideShape::kRow >= 0 && StrideShape::kColumn >= 0, "Stride should be fixed"); + static_assert(DilationShape::kRow >= 0 && DilationShape::kColumn >= 0, "Stride should be fixed"); + + // Activations loaded by threadblock + static int const ActivationShapeH = (ThreadBlockOutputShape::kH - 1) * StrideShape::kRow + + (FilterShape::kRow - 1) * DilationShape::kRow + 1; + + static int const ActivationShapeW = (ThreadBlockOutputShape::kW - 1) * StrideShape::kColumn + + (FilterShape::kColumn - 1) * DilationShape::kColumn + 1; + + using ActivationShape = + cutlass::conv::TensorNHWCShape<1, ActivationShapeH, ActivationShapeW, ThreadblockShape::kN >; + + // Define the core components from GEMM + using MmaCore = typename cutlass::conv::threadblock::DepthwiseDirectConvMmaCoreWithLaneAccessSize< + ThreadblockShape, + ThreadBlockOutputShape, + FilterShape, + WarpShape, + InstructionShape, + ElementA, + layout::RowMajor, + ElementB, + layout::ColumnMajor, + ElementAccumulator, + layout::RowMajor, + arch::OpClassSimt, + 128, + 128, + Stages, + MathOperatorTag, + IteratorAlgorithm::kFixedStrideDilation, + StrideShape, + DilationShape, + ActivationShape>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation< + cutlass::MatrixShape, // < outputShape:KMNK, groups per cta> + ThreadBlockOutputShape, + StrideShape, + DilationShape, + ActivationShape, + ElementA, LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + using ThreadOutputShape = typename MmaCore::ThreadOutputShape; + static cutlass::arch::CacheOperation::Kind const CacheOpA = + ((sizeof_bits::value * AlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? 
cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultDirectConvEpilogueSimt< + ThreadblockShape, // < outputShape:KMNK, groups per cta> + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + ThreadOutputShape, + ThreadBlockOutputShape + >::Epilogue; + + // Define the Mma + using Mma = threadblock::DepthwiseFpropDirectConvMultipleStage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + CacheOpA, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages, + Epilogue, + IteratorAlgorithm::kFixedStrideDilation + >; + + // Define the kernel + using Kernel = cutlass::conv::kernel::DirectConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + cutlass::conv::GroupMode::kDepthwise, + ThreadBlockOutputShape + >; +}; + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/direct_convolution.h b/include/cutlass/conv/kernel/direct_convolution.h new file mode 100644 index 0000000000..d4e98fa49e --- /dev/null +++ b/include/cutlass/conv/kernel/direct_convolution.h @@ -0,0 +1,506 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multi-staged Depthwise Convolution kernel. 
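The `kFixedStrideDilation` specialization above sizes the per-threadblock activation tile at compile time from the output tile, filter, stride and dilation: `(out - 1) * stride + (filter - 1) * dilation + 1` in each spatial dimension. A small worked example of that arithmetic follows; the tile and filter sizes are arbitrary examples, not kernel defaults.

```cpp
#include <cstdio>

constexpr int activation_extent(int output_extent, int stride, int filter_extent, int dilation) {
  // Input span needed to produce `output_extent` outputs with the given stride,
  // filter size and dilation: (out - 1) * stride + (filter - 1) * dilation + 1.
  return (output_extent - 1) * stride + (filter_extent - 1) * dilation + 1;
}

int main() {
  // e.g. an 8x8 output tile, 3x3 filter, stride 2, dilation 1:
  constexpr int h = activation_extent(/*output=*/8, /*stride=*/2, /*filter=*/3, /*dilation=*/1);
  constexpr int w = activation_extent(8, 2, 3, 1);
  static_assert(h == 17 && w == 17, "each threadblock stages a 17x17 activation tile");
  std::printf("activation tile: %d x %d\n", h, w);
  return 0;
}
```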
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/platform/platform.h" +#include "cutlass/semaphore.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/epilogue/threadblock/output_iterator_parameter.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure +template > ///! OutputShape per ThreadBlock +struct DirectConvolutionParams { + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + using ThreadBlockOutputShape = ThreadBlockOutputShape_; + static Operator const kConvolutionalOperator = ConvOperator; + using ConvProblemSize = ConvProblemSize_; + using Arguments = Arguments_; + using ConvOutputIteratorParameter = ConvOutputIteratorParameter_; + + using ThreadblockShape = typename Mma::Shape; + static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; + static conv::GroupMode const kGroupMode = GroupMode_; + static int const kStages = Mma::kStages; + + ConvProblemSize problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + gemm::GemmCoord implicit_gemm_problem_size; + int swizzle_log_tile; + int smem_size_; + + int gemm_k_iterations; + int gemm_k_iterations_per_channel; + typename Mma::IteratorA::Params iterator_A; + typename Mma::IteratorA::Element const *ptr_A; + typename Mma::IteratorB::Params iterator_B; + typename Mma::IteratorB::Element const *ptr_B; + typename Mma::IteratorB::Element *ptr_reordered_B; + typename Epilogue::OutputTileIterator::Params iterator_C; + typename Epilogue::OutputTileIterator::Element *ptr_C; + typename Epilogue::OutputTileIterator::Params iterator_D; + typename Epilogue::OutputTileIterator::Element *ptr_D; + typename EpilogueOutputOp::Params output_op; + int *semaphore; + SplitKMode split_k_mode; + int split_k_slices; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + DirectConvolutionParams() : swizzle_log_tile(0), gemm_k_iterations(0) {} + + /// + CUTLASS_HOST_DEVICE + DirectConvolutionParams(Arguments const &args, int *semaphore = nullptr) + : problem_size(args.problem_size), + implicit_gemm_problem_size( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)), + iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())), + ptr_A(args.ref_A.data()), + iterator_B(Mma::IteratorB::getParams(args.problem_size, args.ref_B.layout())), + ptr_B(args.ref_B.data()), + ptr_reordered_B(args.ref_reordered_B.data()), + iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), args.problem_size), + ptr_C(args.ref_C.data()), + iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), args.problem_size), + ptr_D(args.ref_D.data()), + output_op(args.output_op), + semaphore(semaphore), + split_k_mode(args.split_k_mode), + split_k_slices(args.problem_size.split_k_slices) { + gemm_k_iterations = + depthwise_gemm_k_iterations(kConvolutionalOperator, + ThreadblockShape::kK, + args.problem_size, + 
kIteratorAlgorithm, + kGroupMode, + ThreadblockShape::kN); + + gemm_k_iterations_per_channel = implicit_gemm_k_iterations_per_channel( + kConvolutionalOperator, args.problem_size, kIteratorAlgorithm); + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + kConvolutionalOperator, + problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + + swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape); + + // Dynamic SMEM usage because stride and dilation are runtime params. + smem_size_ = (cutlass::platform::max(iterator_A.activation_size, int(sizeof(typename Epilogue::SharedStorage))) * kStages + iterator_B.filter_size); + } + + CUTLASS_HOST_DEVICE + int get_smem_size() { + // Dynamic Smem Size + return smem_size_; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct ReorderKernel { + using Params = Params_; + using ElementB = ElementB_; + + union SharedStorage {}; + + static unsigned int const kReorderKernelThreadPerCTA = 128; + + CUTLASS_HOST_DEVICE + ReorderKernel() {} + + CUTLASS_HOST_DEVICE + static dim3 get_grid_shape(Params const ¶ms) { + return dim3{static_cast( + (params.problem_size.filter_size() + kReorderKernelThreadPerCTA - 1) / + kReorderKernelThreadPerCTA), + 1, + 1}; + } + + CUTLASS_HOST_DEVICE + static dim3 get_block_shape() { return dim3{kReorderKernelThreadPerCTA, 1, 1}; } + + CUTLASS_HOST_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + int64_t m = static_cast(params.problem_size.groups); + int64_t n = static_cast(params.problem_size.filter_size() / params.problem_size.K); + const ElementB *src_with_type = static_cast(params.ptr_B); + ElementB *dst_with_type = static_cast(params.ptr_reordered_B); + + int64_t linear_index = blockIdx.x * kReorderKernelThreadPerCTA + threadIdx.x; + int64_t index_m = linear_index / n; + int64_t index_n = linear_index % n; + int64_t new_linear_index = index_m + index_n * m; + + if (linear_index < m * n) { + dst_with_type[new_linear_index] = src_with_type[linear_index]; + } + return; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + typename ConvProblemSize_ = Conv2dProblemSize, ///! Convolutional operator on 2D or 3D problem + conv::GroupMode GroupMode_ = conv::GroupMode::kNone, ///! 
Group mode + typename ThreadBlockOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1> +> +struct DirectConvolution { + + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + using ThreadBlockOutputShape = ThreadBlockOutputShape_; + static Operator const kConvolutionalOperator = ConvOperator; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using ElementC = typename EpilogueOutputOp::ElementOutput; + + /// Set output tensor C layout + using LayoutC = LayoutA; + + using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator; + using ElementCompute = typename EpilogueOutputOp::ElementCompute; + + using WarpMmaOperator = typename Mma::Policy::Operator; + + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; + + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename WarpMmaOperator::Shape; + using InstructionShape = typename cutlass::gemm::GemmShape<1, 1, 1>; + + static int const kStages = Mma::kStages; + static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; + static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + using TensorRefA = typename Mma::IteratorA::TensorRef; + using TensorRefB = typename Mma::IteratorB::TensorRef; + using TensorRefC = cutlass::TensorRef; + + /// Check iterator A and B convolution dimension are the same and + // set device::ImplicitGemmConvolution::kConvDim + static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, + "Convolution on different different dimensions is not supported"); + static int const kConvDim = Mma::IteratorA::kConvDim; + + /// Conv dimension and problem size structure (Conv2d or Conv3d) + using ConvProblemSize = ConvProblemSize_; + + static conv::GroupMode const kGroupMode = GroupMode_; + + + // + // + // + using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter< + LayoutC, + typename Epilogue::OutputTileIterator::Layout, + TensorRefC, + ConvOperator, + ConvProblemSize + >; + + + /// Argument structure + struct Arguments { + + // + // Data members + // + + ConvProblemSize problem_size; + TensorRefA ref_A; + TensorRefB ref_B; + TensorRefB ref_reordered_B; + TensorRefC ref_C; + TensorRefC ref_D; + typename EpilogueOutputOp::Params output_op; + SplitKMode split_k_mode; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size + ): + problem_size(problem_size) { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size, + TensorRefA const & ref_A, + TensorRefB const & ref_B, + TensorRefC const & ref_C, + TensorRefC const & ref_D, + typename EpilogueOutputOp::Params const & output_op, + TensorRefB const & ref_reordered_B = nullptr, + SplitKMode const & split_k_mode = SplitKMode::kSerial + ): + problem_size(problem_size), + ref_A(ref_A), + ref_B(ref_B), + ref_C(ref_C), + ref_D(ref_D), + output_op(output_op), 
+ ref_reordered_B(ref_reordered_B), + split_k_mode(split_k_mode) + { + + } + + }; + + using Params = + typename cutlass::conv::kernel::DirectConvolutionParams; + + using ReorderKernel = typename cutlass::conv::kernel::ReorderKernel; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + DirectConvolution() { } + + /// Executes one ImplicitGEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if threadblock is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || + params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) { + + return; + } + + // Compute position within threadblock + int thread_idx = threadIdx.x; + int iterator_column_offset = 0; + int filter_row_offset = 0; + if (kGroupMode != GroupMode::kNone) { + if (kGroupMode == GroupMode::kDepthwise) { + iterator_column_offset += threadblock_tile_idx.n() * Mma::Shape::kN; + } + } + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.iterator_A, + params.problem_size, + params.ptr_A, + thread_idx, + MatrixCoord( + threadblock_tile_idx.m() + threadblock_tile_idx.k(), + iterator_column_offset + ) + ); + + typename Mma::IteratorB iterator_B( + params.iterator_B, + params.problem_size, + params.ptr_reordered_B, + thread_idx, + MatrixCoord( + filter_row_offset, + iterator_column_offset + ) + ); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. 
+ int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // Compute logical position within grid + threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + + MatrixCoord threadblock_offset( + threadblock_tile_idx.m() + threadblock_tile_idx.k(), + threadblock_tile_idx.n() * Mma::Shape::kN + ); + + // Tile iterator writing to destination tensor + typename Epilogue::OutputTileIterator iterator_D( + params.iterator_D, + params.ptr_D, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C( + params.iterator_C, + params.ptr_C, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + + // Compute threadblock-scoped matrix multiply-add + // Epilogue is fused in the mainloop + mma(params.gemm_k_iterations, + accumulators, + iterator_A, + params.iterator_A, + iterator_B, + params.iterator_B, + accumulators, + epilogue, + output_op, + iterator_D, + iterator_C, + params.split_k_slices); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution.h b/include/cutlass/conv/kernel/implicit_gemm_convolution.h index d3f1a19f27..b1e0b477a8 100644 --- a/include/cutlass/conv/kernel/implicit_gemm_convolution.h +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -61,8 +61,9 @@ template < typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate typename Epilogue_, ///! Epilogue typename ThreadblockSwizzle_, ///! Threadblock swizzling function - conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) - typename ConvProblemSize_ = Conv2dProblemSize ///! Convolutional operator on 2D or 3D problem + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad, Deconv) + typename ConvProblemSize_ = Conv2dProblemSize, ///! Convolutional operator on 2D or 3D problem + conv::GroupMode GroupMode_ = conv::GroupMode::kNone ///! 
Group mode > struct ImplicitGemmConvolution { @@ -117,6 +118,8 @@ struct ImplicitGemmConvolution { /// Conv dimension and problem size structure (Conv2d or Conv3d) using ConvProblemSize = ConvProblemSize_; + static conv::GroupMode const kGroupMode = GroupMode_; + /// Wgrad C stride idx for implicit gemm algorithm // Conv2d row-major matrix C (KxRSC) // Conv3d row-major matrix C (KxTRSC) @@ -198,6 +201,7 @@ struct ImplicitGemmConvolution { int swizzle_log_tile; int gemm_k_iterations; + int gemm_k_iterations_per_channel; typename Mma::IteratorA::Params iterator_A; typename Mma::IteratorA::Element const *ptr_A; typename Mma::IteratorB::Params iterator_B; @@ -229,9 +233,9 @@ struct ImplicitGemmConvolution { ptr_A(args.ref_A.data()), iterator_B(args.problem_size, args.ref_B.layout()), ptr_B(args.ref_B.data()), - iterator_C(ConvOutputIteratorParameter::layout(args.ref_C)), + iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)), ptr_C(args.ref_C.data()), - iterator_D(ConvOutputIteratorParameter::layout(args.ref_D)), + iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)), ptr_D(args.ref_D.data()), output_op(args.output_op), semaphore(semaphore), @@ -241,7 +245,12 @@ struct ImplicitGemmConvolution { kConvolutionalOperator, ThreadblockShape::kK, args.problem_size, - kIteratorAlgorithm); + kIteratorAlgorithm, + kGroupMode, + ThreadblockShape::kN); + + gemm_k_iterations_per_channel = implicit_gemm_k_iterations_per_channel( + kConvolutionalOperator, args.problem_size, kIteratorAlgorithm); ThreadblockSwizzle threadblock_swizzle; @@ -286,6 +295,17 @@ struct ImplicitGemmConvolution { // Compute position within threadblock int thread_idx = threadIdx.x; + int iterator_A_column_offset = threadblock_tile_idx.k() * Mma::Shape::kK; + if (kGroupMode != GroupMode::kNone) { + if (kGroupMode != GroupMode::kDepthwise) { + int k_per_group = params.problem_size.K / params.problem_size.groups; + int group_idx = threadblock_tile_idx.n() * Mma::Shape::kN / k_per_group; + int channels_per_group = params.problem_size.C / params.problem_size.groups; + iterator_A_column_offset += group_idx * channels_per_group; + } else { + iterator_A_column_offset += threadblock_tile_idx.n() * Mma::Shape::kN; + } + } // Construct iterators to A and B operands typename Mma::IteratorA iterator_A( @@ -295,7 +315,7 @@ struct ImplicitGemmConvolution { thread_idx, MatrixCoord( threadblock_tile_idx.m() * Mma::Shape::kM, - threadblock_tile_idx.k() * Mma::Shape::kK + iterator_A_column_offset ) ); @@ -312,7 +332,7 @@ struct ImplicitGemmConvolution { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. 
- int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx_sync(); int lane_idx = threadIdx.x % 32; // @@ -327,7 +347,7 @@ struct ImplicitGemmConvolution { accumulators.clear(); // Compute threadblock-scoped matrix multiply-add - mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, params.gemm_k_iterations_per_channel); // // Epilogue @@ -377,7 +397,6 @@ struct ImplicitGemmConvolution { threadblock_offset ); - // Construct the epilogue Epilogue epilogue( shared_storage.epilogue, diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h index d43521f155..74ecae4014 100644 --- a/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -119,6 +119,8 @@ struct ImplicitGemmConvolutionFusion { /// Conv dimension and problem size structure (Conv2d or Conv3d) using ConvProblemSize = ConvProblemSize_; + static conv::GroupMode const kGroupMode = conv::GroupMode::kNone; + /// Wgrad C stride idx for implicit gemm algorithm // Conv2d row-major matrix C (KxRSC) // Conv3d row-major matrix C (KxTRSC) @@ -200,32 +202,30 @@ struct ImplicitGemmConvolutionFusion { /// Parameters structure struct Params { - ConvProblemSize problem_size; - cutlass::gemm::GemmCoord grid_tiled_shape; - gemm::GemmCoord implicit_gemm_problem_size; - int swizzle_log_tile; - int gemm_k_iterations; - typename Mma::IteratorA::Params iterator_A; - typename Mma::IteratorA::Element const *ptr_A; - typename Mma::IteratorB::Params iterator_B; - typename Mma::IteratorB::Element const *ptr_B; - typename Mma::IteratorScaleBias::Params iterator_scale_bias; - typename Mma::IteratorScaleBias::Element const *ptr_scale; - typename Mma::IteratorScaleBias::Element const *ptr_bias; - typename Epilogue::OutputTileIterator::Params iterator_C; - typename Epilogue::OutputTileIterator::Element *ptr_C; - typename Epilogue::OutputTileIterator::Params iterator_D; - typename Epilogue::OutputTileIterator::Element *ptr_D; - typename EpilogueOutputOp::Params output_op; - int *semaphore; - SplitKMode split_k_mode; + ConvProblemSize problem_size{}; + cutlass::gemm::GemmCoord grid_tiled_shape{}; + gemm::GemmCoord implicit_gemm_problem_size{}; + int swizzle_log_tile{0}; + int gemm_k_iterations{0}; + typename Mma::IteratorA::Params iterator_A{}; + typename Mma::IteratorA::Element const *ptr_A = nullptr; + typename Mma::IteratorB::Params iterator_B{}; + typename Mma::IteratorB::Element const *ptr_B = nullptr; + typename Mma::IteratorScaleBias::Params iterator_scale_bias{}; + typename Mma::IteratorScaleBias::Element const *ptr_scale = nullptr; + typename Mma::IteratorScaleBias::Element const *ptr_bias = nullptr; + typename Epilogue::OutputTileIterator::Params iterator_C {}; + typename Epilogue::OutputTileIterator::Element *ptr_C = nullptr; + typename Epilogue::OutputTileIterator::Params iterator_D {}; + typename Epilogue::OutputTileIterator::Element *ptr_D = nullptr; + 
typename EpilogueOutputOp::Params output_op {}; + int *semaphore = nullptr; + SplitKMode split_k_mode {}; // // Methods // - - CUTLASS_HOST_DEVICE - Params(): swizzle_log_tile(0), gemm_k_iterations(0) { } + Params() = default; /// CUTLASS_HOST_DEVICE @@ -337,7 +337,7 @@ struct ImplicitGemmConvolutionFusion { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx_sync(); int lane_idx = threadIdx.x % 32; // diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h index 65191f5a6a..bf00f90bac 100644 --- a/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -117,6 +117,8 @@ struct ImplicitGemmConvolutionStridedDgrad { /// Conv dimension and problem size structure (Conv2d or Conv3d) using ConvProblemSize = ConvProblemSize_; + static conv::GroupMode const kGroupMode = conv::GroupMode::kNone; + /// Wgrad C stride idx for implicit gemm algorithm // Conv2d row-major matrix C (KxRSC) // Conv3d row-major matrix C (KxTRSC) @@ -156,21 +158,20 @@ struct ImplicitGemmConvolutionStridedDgrad { // Data members // - ConvProblemSize problem_size; - TensorRefA ref_A; - TensorRefB ref_B; - TensorRefC ref_C; - TensorRefC ref_D; - typename EpilogueOutputOp::Params output_op; - SplitKMode split_k_mode; + ConvProblemSize problem_size{}; + TensorRefA ref_A{}; + TensorRefB ref_B{}; + TensorRefC ref_C{}; + TensorRefC ref_D{}; + typename EpilogueOutputOp::Params output_op{}; + SplitKMode split_k_mode{}; // // Methods // /// Default ctor - CUTLASS_HOST_DEVICE - Arguments() { } + Arguments() = default; CUTLASS_HOST_DEVICE Arguments( @@ -203,29 +204,28 @@ struct ImplicitGemmConvolutionStridedDgrad { /// Parameters structure struct Params { - ConvProblemSize problem_size; - cutlass::gemm::GemmCoord grid_tiled_shape; - FastDivmod stride_h_divmod; - FastDivmod stride_w_divmod; - int gemm_k_iterations; - typename Mma::IteratorA::Params iterator_A; - typename Mma::IteratorA::Element const *ptr_A; - typename Mma::IteratorB::Params iterator_B; - typename Mma::IteratorB::Element const *ptr_B; - typename Epilogue::OutputTileIterator::Params iterator_C; - typename Epilogue::OutputTileIterator::Element *ptr_C; - typename Epilogue::OutputTileIterator::Params iterator_D; - typename Epilogue::OutputTileIterator::Element *ptr_D; - typename EpilogueOutputOp::Params output_op; - int *semaphore; - SplitKMode split_k_mode; + ConvProblemSize problem_size{}; + cutlass::gemm::GemmCoord grid_tiled_shape{}; + int swizzle_log_tile{0}; + FastDivmod stride_h_divmod{}; + FastDivmod stride_w_divmod{}; + int gemm_k_iterations{0}; + typename Mma::IteratorA::Params iterator_A{}; + typename Mma::IteratorA::Element const *ptr_A = nullptr; + typename Mma::IteratorB::Params iterator_B{}; + typename Mma::IteratorB::Element const *ptr_B = nullptr; + typename Epilogue::OutputTileIterator::Params iterator_C{}; + typename 
Epilogue::OutputTileIterator::Element *ptr_C = nullptr; + typename Epilogue::OutputTileIterator::Params iterator_D{}; + typename Epilogue::OutputTileIterator::Element *ptr_D = nullptr; + typename EpilogueOutputOp::Params output_op {}; + int *semaphore = nullptr; + SplitKMode split_k_mode {}; // // Methods // - - CUTLASS_HOST_DEVICE - Params(): gemm_k_iterations(0) { } + Params() = default; /// CUTLASS_HOST_DEVICE @@ -257,6 +257,8 @@ struct ImplicitGemmConvolutionStridedDgrad { args.problem_size, {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, args.problem_size.split_k_slices); + + swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape); } }; @@ -281,7 +283,7 @@ struct ImplicitGemmConvolutionStridedDgrad { ThreadblockSwizzle threadblock_swizzle; cutlass::gemm::GemmCoord threadblock_tile_idx = - threadblock_swizzle.get_tile_offset(params.grid_tiled_shape); + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); // Early exit if CTA is out of range if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || @@ -333,7 +335,7 @@ struct ImplicitGemmConvolutionStridedDgrad { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx_sync(); int lane_idx = threadIdx.x % 32; // Check if CTA contributes valid MMA (Dy * w) and accumulator will be non-zero after MMA @@ -387,16 +389,15 @@ struct ImplicitGemmConvolutionStridedDgrad { // Construct the semaphore. int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m(); - Semaphore semaphore(params.semaphore + block_idx, thread_idx); - + // Compute logical position within grid threadblock_tile_idx = - threadblock_swizzle.get_tile_offset(params.grid_tiled_shape); + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); // If performing a reduction via split-K, fetch the initial synchronization if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { - + // Fetch the synchronization lock initially but do not block. semaphore.fetch(); @@ -419,51 +420,51 @@ struct ImplicitGemmConvolutionStridedDgrad { start_r, start_s, threadblock_offset ); - - // Tile iterator reading from source accumulator tensor - typename Epilogue::OutputTileIterator iterator_C( - params.iterator_C, - params.ptr_C, - ConvOutputIteratorParameter::extent(params.problem_size), - thread_idx, - params.stride_h_divmod, params.stride_w_divmod, - start_r, start_s, - threadblock_offset - ); - // Construct the epilogue Epilogue epilogue( - shared_storage.epilogue, - thread_idx, - warp_idx, + shared_storage.epilogue, + thread_idx, + warp_idx, lane_idx); - // Wait on the semaphore - this latency may have been covered by iterator construction - if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { - - // For subsequent threadblocks, the source matrix is held in the 'D' tensor. 
- if (threadblock_tile_idx.k()) { - iterator_C = iterator_D; - } + if (output_op.is_source_needed()) + { + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C( + params.iterator_C, + params.ptr_C, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + params.stride_h_divmod, params.stride_w_divmod, + start_r, start_s, + threadblock_offset); - semaphore.wait(threadblock_tile_idx.k()); + // Wait on the semaphore - this latency may have been covered by iterator construction + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_idx.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_idx.k()); + } + + // Run epilogue with addend source iterator + epilogue(output_op, iterator_D, accumulators, iterator_C); } - // Each split-k-slice writes to a unique tensor location - else if (params.split_k_mode == SplitKMode::kParallel) { - iterator_D.add_pointer_offset(threadblock_tile_idx.k() * - cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size)); + else + { + // Run epilogue without addend source iterator + epilogue(output_op, iterator_D, accumulators); } - // Run efficient epilogue - epilogue(output_op, iterator_D, accumulators, iterator_C); - // // Release the semaphore // - if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { int lock = 0; if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) { @@ -475,10 +476,11 @@ struct ImplicitGemmConvolutionStridedDgrad { // Otherwise, the semaphore is incremented lock = threadblock_tile_idx.k() + 1; } - + semaphore.release(lock); } - } + + } }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -488,4 +490,3 @@ struct ImplicitGemmConvolutionStridedDgrad { } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h new file mode 100644 index 0000000000..b05fd2d3ed --- /dev/null +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h @@ -0,0 +1,494 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Convolution kernel with an epilogue that computes the absolute maximum value of the output + and a pre-activation-function auxiliary output. The auxiliary output is also (optionally) + stored to global memory. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/semaphore.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/epilogue/threadblock/output_iterator_parameter.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + typename ConvProblemSize_ = Conv2dProblemSize ///! 
Convolutional operator on 2D or 3D problem
+>
+struct ImplicitGemmConvolutionWithAbsMax {
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static Operator const kConvolutionalOperator = ConvOperator;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename EpilogueOutputOp::ElementOutput;
+
+  /// Set output tensor C layout
+  using LayoutC = LayoutA;
+
+  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
+  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
+
+  using WarpMmaOperator = typename Mma::Policy::Operator;
+
+  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
+  using MathOperator = typename ArchMmaOperator::Operator;
+
+  using OperatorClass = typename WarpMmaOperator::OperatorClass;
+  using ArchTag = typename WarpMmaOperator::ArchTag;
+
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename WarpMmaOperator::Shape;
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  static int const kStages = Mma::kStages;
+  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm;
+  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  using TensorRefA = typename Mma::IteratorA::TensorRef;
+  using TensorRefB = typename Mma::IteratorB::TensorRef;
+  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
+  using TensorRefAux = cutlass::TensorRef;
+
+  /// Check iterator A and B convolution dimension are the same and
+  // set device::ImplicitGemmConvolution::kConvDim
+  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim,
+    "Convolution on different dimensions is not supported");
+  static int const kConvDim = Mma::IteratorA::kConvDim;
+
+  /// Conv dimension and problem size structure (Conv2d or Conv3d)
+  using ConvProblemSize = ConvProblemSize_;
+
+  static conv::GroupMode const kGroupMode = conv::GroupMode::kNone;
+
+  /// Wgrad C stride idx for implicit gemm algorithm
+  // Conv2d row-major matrix C (KxRSC)
+  // Conv3d row-major matrix C (KxTRSC)
+  static int const kWgradCStrideIdx =
+    platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
+
+  /// This chooses the appropriate stride element of the C tensor.
+  static int const kTensorCStrideIdx =
+    (kConvolutionalOperator == conv::Operator::kWgrad ?
kWgradCStrideIdx : 0); + + // + // + // + using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter< + LayoutC, + typename Epilogue::OutputTileIterator::Layout, + TensorRefC, + ConvOperator, + ConvProblemSize + >; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + ConvProblemSize problem_size; + TensorRefA ref_A; + TensorRefB ref_B; + TensorRefC ref_C; + TensorRefC ref_D; + TensorRefC ref_Aux; + + typename EpilogueOutputOp::Params output_op; + SplitKMode split_k_mode; + + void * ptr_Vector; + + typename LayoutC::Stride::Index ldr; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size + ): + problem_size(problem_size) { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size, + TensorRefA const & ref_A, + TensorRefB const & ref_B, + TensorRefC const & ref_C, + TensorRefC const & ref_D, + TensorRefAux const & ref_Aux, + typename EpilogueOutputOp::Params const & output_op, + SplitKMode const & split_k_mode = SplitKMode::kSerial, + void * ptr_Vector = nullptr, + typename LayoutC::Stride::Index ldr = 0 + ): + problem_size(problem_size), + ref_A(ref_A), + ref_B(ref_B), + ref_C(ref_C), + ref_D(ref_D), + ref_Aux(ref_Aux), + output_op(output_op), + split_k_mode(split_k_mode), + ptr_Vector(ptr_Vector), + ldr(ldr) + { + + } + + }; + + /// Parameters structure + struct Params { + ConvProblemSize problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + gemm::GemmCoord implicit_gemm_problem_size; + int swizzle_log_tile; + + int gemm_k_iterations; + typename Mma::IteratorA::Params iterator_A; + typename Mma::IteratorA::Element const *ptr_A; + typename Mma::IteratorB::Params iterator_B; + typename Mma::IteratorB::Element const *ptr_B; + typename Epilogue::OutputTileIterator::Params iterator_C; + typename Epilogue::OutputTileIterator::Element *ptr_C; + typename Epilogue::OutputTileIterator::Params iterator_D; + typename Epilogue::OutputTileIterator::Element *ptr_D; + typename Epilogue::AuxOutputTileIterator::Params iterator_Aux; + typename Epilogue::AuxOutputTileIterator::Element *ptr_Aux; + typename EpilogueOutputOp::Params output_op; + int *semaphore; + SplitKMode split_k_mode; + + void * ptr_Vector; + typename LayoutC::Stride::Index ldr; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): + swizzle_log_tile(0), + gemm_k_iterations(0), + ptr_Vector(nullptr), + ldr(0) + { } + + /// + CUTLASS_HOST_DEVICE + Params( + Arguments const &args, + int *semaphore = nullptr + ): + problem_size(args.problem_size), + implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)), + iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())), + ptr_A(args.ref_A.data()), + iterator_B(args.problem_size, args.ref_B.layout()), + ptr_B(args.ref_B.data()), + iterator_C(ConvOutputIteratorParameter::layout(args.ref_C)), + ptr_C(args.ref_C.data()), + iterator_D(ConvOutputIteratorParameter::layout(args.ref_D)), + ptr_D(args.ref_D.data()), + iterator_Aux(ConvOutputIteratorParameter::layout(args.ref_Aux)), + ptr_Aux(args.ref_Aux.data()), + output_op(args.output_op), + semaphore(semaphore), + split_k_mode(args.split_k_mode), + ptr_Vector(args.ptr_Vector), + ldr(args.ldr) + + { + gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size); + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = 
threadblock_swizzle.get_tiled_shape( + implicit_gemm_problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + + swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape); + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + ImplicitGemmConvolutionWithAbsMax() { } + + /// Executes one ImplicitGEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || + params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) { + + return; + } + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.iterator_A, + params.problem_size, + params.ptr_A, + thread_idx, + MatrixCoord( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.k() * Mma::Shape::kK + ) + ); + + typename Mma::IteratorB iterator_B( + params.iterator_B, + params.problem_size, + params.ptr_B, + thread_idx, + MatrixCoord( + threadblock_tile_idx.k() * Mma::Shape::kK, + threadblock_tile_idx.n() * Mma::Shape::kN + ) + ); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // Construct the semaphore. + int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m(); + + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // Compute logical position within grid + threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // If performing a reduction via split-K, fetch the initial synchronization + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k()); + } + + MatrixCoord threadblock_offset( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.n() * Mma::Shape::kN + ); + + // Tile iterator writing to destination tensor + typename Epilogue::OutputTileIterator iterator_D( + params.iterator_D, + params.ptr_D, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Tile iterator writing to auxiliary tensor. 
+ typename Epilogue::AuxOutputTileIterator iterator_Aux( + params.iterator_Aux, + params.ptr_Aux, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C( + params.iterator_C, + params.ptr_C, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Define the reduction output pointer and move to the appropriate place + typename Epilogue::ElementVector *ptr_Vector = + static_cast(params.ptr_Vector); + + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Move to appropriate location for this output tile + if (ptr_Vector) { + ptr_Vector += threadblock_offset.column() + threadblock_tile_idx.m() * params.ldr; + } + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_idx.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_idx.k()); + + } + // Each split-k-slice writes to a unique tensor location + else if (params.split_k_mode == SplitKMode::kParallel) { + iterator_D.add_pointer_offset(threadblock_tile_idx.k() * + cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size)); + } + + // Execute the epilogue operator to update the destination tensor. + epilogue(output_op, + // Only the final block uses Vector + ((params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) && + (params.grid_tiled_shape.k() != threadblock_tile_idx.k() + 1)) + ? nullptr + : ptr_Vector, + iterator_D, + accumulators, + iterator_C, + iterator_Aux, + ConvOutputIteratorParameter::extent(params.problem_size), + threadblock_offset); + + // + // Release the semaphore + // + + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_idx.k() + 1; + } + + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h index 2ab47637a0..1f27e0686d 100644 --- a/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -61,7 +61,7 @@ template < typename Mma_, ///! 
Threadblock-scoped matrix multiply-accumulate typename Epilogue_, ///! Epilogue typename ThreadblockSwizzle_, ///! Threadblock swizzling function - conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad, Deconv) typename ConvProblemSize_ = Conv2dProblemSize ///! Convolutional operator on 2D or 3D problem > struct ImplicitGemmConvolutionWithFusedEpilogue { @@ -117,6 +117,8 @@ struct ImplicitGemmConvolutionWithFusedEpilogue { /// Conv dimension and problem size structure (Conv2d or Conv3d) using ConvProblemSize = ConvProblemSize_; + static conv::GroupMode const kGroupMode = conv::GroupMode::kNone; + /// Wgrad C stride idx for implicit gemm algorithm // Conv2d row-major matrix C (KxRSC) // Conv3d row-major matrix C (KxTRSC) @@ -255,9 +257,9 @@ struct ImplicitGemmConvolutionWithFusedEpilogue { ptr_A(args.ref_A.data()), iterator_B(args.problem_size, args.ref_B.layout()), ptr_B(args.ref_B.data()), - iterator_C(ConvOutputIteratorParameter::layout(args.ref_C)), + iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)), ptr_C(args.ref_C.data()), - iterator_D(ConvOutputIteratorParameter::layout(args.ref_D)), + iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)), ptr_D(args.ref_D.data()), output_op(args.output_op), semaphore(semaphore), @@ -339,7 +341,7 @@ struct ImplicitGemmConvolutionWithFusedEpilogue { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx_sync(); int lane_idx = threadIdx.x % 32; // diff --git a/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp b/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp new file mode 100644 index 0000000000..657ac6b3ec --- /dev/null +++ b/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp @@ -0,0 +1,76 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/kernel_hardware_info.hpp" + +#include "cute/tensor.hpp" +#include "cute/arch/cluster_sm90.hpp" + +#include "cutlass/conv/detail.hpp" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/dispatch_policy.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/pipeline/sm90_pipeline.hpp" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler.hpp" + +/////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::kernel { + +/////////////////////////////////////////////////////////////////////////////// + +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class TileScheduler_ +> +class ConvUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + TileScheduler_, + cute::enable_if_t> +> : public cutlass::gemm::kernel::GemmUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + TileScheduler_ +> +{}; +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::kernel + diff --git a/include/cutlass/conv/thread/depthwise_mma.h b/include/cutlass/conv/thread/depthwise_mma.h new file mode 100644 index 0000000000..37ece7927e --- /dev/null +++ b/include/cutlass/conv/thread/depthwise_mma.h @@ -0,0 +1,325 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates exposing architecture support for depthwise convolution +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/arch/mma.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/thread/mma.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// MMA operation +template < + /// Size of the matrix product (concept: GemmShape) + typename Shape_, + /// Number of threads participating + int kThreads_, + /// Data type of A elements + typename ElementA, + /// Data type of B elements + typename ElementB, + /// Element type of C matrix + typename ElementC, + /// Inner product operator + typename Operator +> +struct ElementwiseInnerProduct; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// General implementation +template < + /// Size of the matrix product (concept: GemmShape) + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Data type of B elements + typename ElementB_, + /// Element type of C matrix + typename ElementC_> +struct ElementwiseInnerProduct { + using Shape = Shape_; + using Operator = arch::OpMultiplyAdd; + using ElementC = ElementC_; + + CUTLASS_HOST_DEVICE + void operator()(Array &d, + Array const &a, + Array const &b, + Array const &c) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Shape::kN; ++i) { + d[i] = a[i] * b[i] + c[i]; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Specialization of half_t +template <> +struct ElementwiseInnerProduct< + gemm::GemmShape<2, 2, 1>, + 1, + half_t, + half_t, + half_t, + arch::OpMultiplyAdd> { + + using Shape = gemm::GemmShape<2, 2, 1>; + using Operator = arch::OpMultiplyAdd; + using ElementC = half_t; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)) + + __half2 const & A = reinterpret_cast<__half2 const &>(a); + __half2 const & B = reinterpret_cast<__half2 const &>(b); + __half2 const & C = reinterpret_cast<__half2 const &>(c); + + __half2 tmp_D = __hfma2(A, B, C); + + d = reinterpret_cast const &>(tmp_D); + +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 2; ++i) { + d[i] = a[i] * b[i] + c[i]; + } +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape, + /// Data type of A elements + typename ElementA, + /// Data 
type of B elements + typename ElementB, + /// Element type of C matrix + typename ElementC, + /// Concept: arch::OpMultiplyAdd or arch::Mma<> + typename Operator = arch::OpMultiplyAdd, + /// Used for partial specialization + typename Enable = bool +> +struct DepthwiseDirectConvElementwiseInnerProduct; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Gemplate that handles all packed matrix layouts +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Data type of B elements + typename ElementB_, + /// Element type of C matrix + typename ElementC_, + /// Operator used to compute GEMM + typename Operator_ +> +struct DepthwiseDirectConvElementwiseInnerProductGeneric { + + /// Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + /// Data type of operand A + using ElementA = ElementA_; + + /// Data type of operand B + using ElementB = ElementB_; + + /// Element type of operand C + using ElementC = ElementC_; + + /// Underlying mathematical operator + using Operator = Operator_; + + /// A operand storage + using FragmentA = Array; + + /// B operand storage + using FragmentB = Array; + + /// C operand storage + using FragmentC = Array; + + /// Instruction + using MmaOp = cutlass::conv::thread::ElementwiseInnerProduct< + gemm::GemmShape, + 1, + ElementA, + ElementB, + ElementC, + Operator>; + + + // + // Methods + // + + /// Computes a matrix product D = A * B + C + CUTLASS_HOST_DEVICE + void operator()( + FragmentC & D, + FragmentA const & A, + FragmentB const & B, + FragmentC const & C) { + Array *ptr_D = reinterpret_cast *>(&D); + Array const *ptr_A = + reinterpret_cast const *>(&A); + Array const *ptr_B = + reinterpret_cast const *>(&B); + + MmaOp mma_op; + + // Copy accumulators + D = C; + + // Compute matrix product + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Shape::kN / MmaOp::Shape::kN; ++n) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Shape::kM; ++m) { + + Array tmpD = ptr_D[m * Shape::kN / MmaOp::Shape::kN + n]; + Array tmpA = ptr_A[m * Shape::kN / MmaOp::Shape::kN + n]; + Array tmpB = ptr_B[n]; + + mma_op(tmpD, tmpA, tmpB, tmpD); + + ptr_D[m * Shape::kN / MmaOp::Shape::kN + n] = tmpD; + + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Data type of B elements + typename ElementB_, + /// Element type of C matrix + typename ElementC_ +> +struct DepthwiseDirectConvElementwiseInnerProduct< + Shape_, + ElementA_, + ElementB_, + ElementC_, + arch::OpMultiplyAdd + > { + /// Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + /// Data type of operand A + using ElementA = ElementA_; + + /// Data type of operand B + using ElementB = ElementB_; + + /// Element type of operand C + using ElementC = ElementC_; + + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + + /// A operand storage + using FragmentA = + Array; // output_tile_size per thread * groups_per_thread + + /// B operand storage + using FragmentB = Array; // 1 * groups_per_thread + + /// C operand storage + using FragmentC = + Array; // output_tile_size per thread * groups_per_thread + + static bool const use_optimized = 0; + + using 
ArchMmaOperator = DepthwiseDirectConvElementwiseInnerProductGeneric; + + // + // Methods + // + + /// Computes a matrix product D = A * B + C + CUTLASS_HOST_DEVICE + void operator()( + FragmentC & D, + FragmentA const & A, + FragmentB const & B, + FragmentC const & C) { + + ArchMmaOperator mma; + + mma(D, A, B, C); + + } +}; + +} // namespace thread +} // namespace conv +} // namespace cutlass diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h index 0af34babfd..978c14feb6 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h index 4014173b48..6fb1cb18e9 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -248,7 +248,7 @@ class Conv2dDgradFilterTileAccessIteratorOptimized < pointer_ += pointer_offset * sizeof_bits::value / 8; } - CUTLASS_HOST_DEVICE + CUTLASS_DEVICE void advance() { int next_idx = 0; @@ -263,18 +263,33 @@ class Conv2dDgradFilterTileAccessIteratorOptimized < // Move filter_r by stride_h filter_r_ += problem_size_.stride_h; - +#if 0 bool check = (filter_r_ < problem_size_.R); filter_r_ = check ? filter_r_ : start_r_; next_idx = check ? 1 : 2; reset_bytes += (check ? 
reset_bytes_s_ : reset_bytes_r_); +#else + asm volatile( + "{\n\t" + " .reg .pred %%p;\n\t" + " .reg .s64 t1;\n\t" + " setp.lt.s32 %%p, %3, %4;\n\t" + " selp.s32 %0, %3, %5, %%p;\n\t" + " selp.s32 %1, 1, 2, %%p;\n\t" + " selp.s64 t1, %6, %7, %%p;\n\t" + " add.s64 %2, %8, t1;\n\t" + "}\n" + : "=r"(filter_r_), "=r"(next_idx), "=l"(reset_bytes) + : "r"(filter_r_), "r"(problem_size_.R), "r"(start_r_), + "l"(reset_bytes_s_), "l"(reset_bytes_r_), "l"(reset_bytes)); +#endif } // offset pointers by offset_bytes pointer_ += (params_.inc_next[next_idx] - reset_bytes); - if (next_idx == 2) { + if (next_idx == 2) { filter_k_ += params_.filter_k_delta; } diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h index 80448f361e..1de41f3f7b 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -528,7 +528,6 @@ class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < int k = filter_k_ + iteration_vector_ * AccessType::kElements; return TensorCoord(n, p, q, k); - } /// Returns true if the current coordinate is within the output tensor Dy diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h index 4b1e906aea..ffa13c934d 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -230,7 +230,7 @@ class Conv2dDgradOutputGradientTileAccessIteratorOptimized < offset_p[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h; offset_q[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w; - // Intialize pointers for gemm_k=0 + // Initialize pointers for gemm_k=0 TensorCoord coord{offset_n[s], offset_p[s], offset_q[s], filter_k_}; pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; @@ -321,7 +321,7 @@ class Conv2dDgradOutputGradientTileAccessIteratorOptimized < add_byte_offset_(pointer_offset * sizeof_bits::value / 8); } - CUTLASS_HOST_DEVICE + CUTLASS_DEVICE void advance() { int next_idx = 0; @@ -336,23 +336,37 @@ class Conv2dDgradOutputGradientTileAccessIteratorOptimized < // Move filter_r by stride_h filter_r_ += problem_size_.stride_h; +#if 0 if (filter_r_ < problem_size_.R) { - + next_idx = 1; - // Restore bytes in q coordinate (Mma in filter s dimenstion) + // Restore bytes in q coordinate (Mma in filter s dimension) reset_bytes = reset_bytes_s_; } else { // Restore filter_r filter_r_ = start_r_; - + next_idx = 2; - - // Restore bytes in p and q coordinate (Mma in filter s and r dimenstion) + + // Restore bytes in p and q coordinate (Mma in filter s and r dimension) reset_bytes = reset_bytes_r_; } +#else + asm volatile( + "{\n\t" + " .reg .pred %%p;\n\t" + " setp.lt.s32 %%p, %3, %4;\n\t" + " selp.s32 %0, %3, %5, %%p;\n\t" + " selp.s32 %1, 1, 2, %%p;\n\t" + " selp.s64 %2, %6, %7, %%p;\n\t" + "}\n" + : "=r"(filter_r_), "=r"(next_idx), "=l"(reset_bytes) + : "r"(filter_r_), "r"(problem_size_.R), "r"(start_r_), + "l"(reset_bytes_s_), "l"(reset_bytes_r_)); +#endif } // offset pointers by offset_bytes @@ -619,7 +633,7 @@ class Conv2dDgradOutputGradientTileAccessIteratorOptimized < CUTLASS_PRAGMA_UNROLL for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { - clear_mask(v_idx, filter_k_ >= problem_size.K); + clear_mask(v_idx, filter_k_ + v_idx * AccessType::kElements >= problem_size.K); } set_iteration_index(0); diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h index add089af91..9317ea0cd9 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -67,7 +67,8 @@ template < typename Element_, typename Layout_, typename ThreadMap_, - typename AccessType_ = cutlass::AlignedArray + typename AccessType_ = cutlass::AlignedArray, + conv::GroupMode GroupMode_ = conv::GroupMode::kNone > class Conv2dFpropActivationTileAccessIteratorAnalytic { public: @@ -89,6 +90,7 @@ class Conv2dFpropActivationTileAccessIteratorAnalytic { static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; static int const kConvDim = 2; using ConvProblemSize = typename conv::Conv2dProblemSize; + static conv::GroupMode const kGroupMode = GroupMode_; static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; @@ -119,6 +121,11 @@ class Conv2dFpropActivationTileAccessIteratorAnalytic { int filter_c_; int filter_r_; int filter_s_; + int filter_c_init_; + int group_idx_offset_; + int channels_per_group_; + int crs_cnt_; + int crs_per_group_; int offset_n_[ThreadMap::Iterations::kStrided]; int offset_p_[ThreadMap::Iterations::kStrided]; @@ -137,6 +144,8 @@ class Conv2dFpropActivationTileAccessIteratorAnalytic { params_(params), problem_size_(problem_size), pointer_(reinterpret_cast(ptr)), + crs_cnt_(0), + group_idx_offset_(0), filter_c_(0), filter_r_(0), filter_s_(0) { @@ -145,6 +154,12 @@ class Conv2dFpropActivationTileAccessIteratorAnalytic { filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + if (kGroupMode != conv::GroupMode::kNone) { + filter_c_init_ = filter_c_; + channels_per_group_ = problem_size_.C / problem_size_.groups; + crs_per_group_ = problem_size_.S * problem_size_.R * ((channels_per_group_ + Shape::kColumn - 1) / Shape::kColumn); + } + CUTLASS_PRAGMA_UNROLL for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; @@ -182,6 +197,10 @@ class Conv2dFpropActivationTileAccessIteratorAnalytic { CUTLASS_HOST_DEVICE void advance() { // moves to the next tile + if (kGroupMode != conv::GroupMode::kNone) { + ++crs_cnt_; + } + ++filter_s_; if (filter_s_ < problem_size_.S) { return; @@ -192,8 +211,19 @@ class Conv2dFpropActivationTileAccessIteratorAnalytic { return; } filter_r_ = 0; - - filter_c_ += Shape::kColumn * problem_size_.split_k_slices; + + if (kGroupMode == conv::GroupMode::kNone) { + filter_c_ += Shape::kColumn * problem_size_.split_k_slices; + } else { + if (crs_cnt_ == crs_per_group_) { + // moves to next group + crs_cnt_ = 0; + ++group_idx_offset_; + filter_c_ = group_idx_offset_ * channels_per_group_ + filter_c_init_; + } else { + filter_c_ += Shape::kColumn * problem_size_.split_k_slices; + } + } } /// Returns the coordinate in the activations tensor X that is currently pointed to @@ -273,7 +303,7 @@ class Conv2dFpropActivationTileAccessIteratorAnalytic { static Status can_implement(Conv2dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % AccessType::kElements) { + if ((problem_size.C / problem_size.groups) % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h index d95f3758d9..5a4489c017 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h 
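
The grouped-conv path added to `Conv2dFpropActivationTileAccessIteratorAnalytic::advance()` counts k-iterations per group (`crs_per_group_`) and, once a group is exhausted, resets `filter_c_` to the next group's channel base. A simplified sketch of just that decision (the surrounding S/R wrapping is omitted and the helper name is illustrative):

```cpp
// Decide the next channel offset after filter_s_ and filter_r_ have wrapped.
inline int next_filter_c(int filter_c, int filter_c_init, int& crs_cnt, int& group_idx,
                         int crs_per_group, int channels_per_group,
                         int tile_k, int split_k_slices, bool grouped) {
  if (!grouped) {
    return filter_c + tile_k * split_k_slices;      // ungrouped advance
  }
  if (crs_cnt == crs_per_group) {                   // consumed every k-iteration of this group
    crs_cnt = 0;
    ++group_idx;                                    // jump to the next group's channel range
    return group_idx * channels_per_group + filter_c_init;
  }
  return filter_c + tile_k * split_k_slices;        // still inside the current group
}
```
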
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h index a4bc830a70..3f1f2bc141 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h index 147d4f1aeb..243d724b36 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -388,7 +388,7 @@ class Conv2dFpropActivationTileAccessIteratorOptimized { static Status can_implement(Conv2dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % AccessType::kElements) { + if ((problem_size.C / problem_size.groups) % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h index 08d3176d73..1725db5af5 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -66,7 +66,9 @@ template < typename Element_, typename Layout_, typename ThreadMap_, - typename AccessType_ = cutlass::AlignedArray + typename AccessType_ = cutlass::AlignedArray, + conv::GroupMode GroupMode_ = conv::GroupMode::kNone, + bool IsDeconv_ = false > class Conv2dFpropFilterTileAccessIteratorAnalytic { public: @@ -84,16 +86,18 @@ class Conv2dFpropFilterTileAccessIteratorAnalytic { using TensorCoord = typename Layout::TensorCoord; using Index = typename Layout::Index; using LongIndex = typename Layout::LongIndex; + static bool const IsDeconv = IsDeconv_; static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; static int const kConvDim = 2; using ConvProblemSize = typename conv::Conv2dProblemSize; + static conv::GroupMode const kGroupMode = GroupMode_; static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), "Vectors implied by the thread map must be divisible by the access type."); - + // // Simplifying assertions // @@ -118,8 +122,14 @@ class Conv2dFpropFilterTileAccessIteratorAnalytic { int filter_r_; int filter_s_; int filter_c_; + int filter_c_init_; + int crs_cnt_; + int crs_per_group_; + int group_idx_offset_c_; + int channels_per_group_; int offset_k_[ThreadMap::Iterations::kStrided]; + int group_idx_offset_k_[ThreadMap::Iterations::kStrided]; public: @@ -134,6 +144,8 @@ class Conv2dFpropFilterTileAccessIteratorAnalytic { params_(params), problem_size_(problem_size), pointer_(reinterpret_cast(ptr)), + crs_cnt_(0), + group_idx_offset_c_(0), filter_r_(0), filter_s_(0), filter_c_(0) { @@ -142,9 +154,26 @@ class Conv2dFpropFilterTileAccessIteratorAnalytic { filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C); + auto output_channels = (IsDeconv ? 
problem_size_.C : problem_size_.K); + + if (kGroupMode != conv::GroupMode::kNone) { + filter_c_init_ = filter_c_; + if (kGroupMode == conv::GroupMode::kDepthwise){ + channels_per_group_ = 1; + crs_per_group_ = problem_size_.S * problem_size_.R; + } else { + channels_per_group_ = input_channels / problem_size_.groups; + crs_per_group_ = problem_size_.S * problem_size_.R * ((channels_per_group_ + Shape::kRow - 1) / Shape::kRow); + } + } + CUTLASS_PRAGMA_UNROLL for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + if (kGroupMode != conv::GroupMode::kNone && kGroupMode != conv::GroupMode::kDepthwise) { + group_idx_offset_k_[s] = (thread_coord.strided() + s * ThreadMap::Delta::kStrided) / (output_channels / problem_size_.groups); + } } set_iteration_index(0); @@ -168,6 +197,10 @@ class Conv2dFpropFilterTileAccessIteratorAnalytic { CUTLASS_HOST_DEVICE void advance() { // moves to the next tile + if (kGroupMode != conv::GroupMode::kNone) { + ++crs_cnt_; + } + ++filter_s_; if (filter_s_ < problem_size_.S) { return; @@ -179,8 +212,21 @@ class Conv2dFpropFilterTileAccessIteratorAnalytic { return; } filter_r_ = 0; - - filter_c_ += Shape::kRow * problem_size_.split_k_slices; + + if (kGroupMode == conv::GroupMode::kNone) { + filter_c_ += Shape::kRow * problem_size_.split_k_slices; + } else { + if (crs_cnt_ == crs_per_group_) { + crs_cnt_ = 0; + filter_c_ = filter_c_init_; + if (kGroupMode != conv::GroupMode::kDepthwise) { + // moves to next group + ++group_idx_offset_c_; + } + } else { + filter_c_ += Shape::kRow * problem_size_.split_k_slices; + } + } } /// Returns the coordinate in the filter tensor W that is currently pointed to @@ -200,8 +246,17 @@ class Conv2dFpropFilterTileAccessIteratorAnalytic { TensorCoord coord = at(); - return coord.n() < problem_size_.K && - coord.c() < problem_size_.C; + auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C); + auto output_channels = (IsDeconv ? problem_size_.C : problem_size_.K); + + if (kGroupMode == conv::GroupMode::kNone) { + return coord.n() < output_channels && coord.c() < input_channels; + } else if (kGroupMode == conv::GroupMode::kDepthwise) { + return coord.n() < output_channels && coord.c() < 1; // channels_per_group_ is always equal to ONE. + } else { + return coord.n() < output_channels && coord.c() < channels_per_group_ && + group_idx_offset_c_ == group_idx_offset_k_[iteration_strided_]; + } } /// Returns a pointer to the vector starting at the current coordinate @@ -242,19 +297,22 @@ class Conv2dFpropFilterTileAccessIteratorAnalytic { CUTLASS_HOST_DEVICE static Status can_implement(Conv2dProblemSize const &problem_size) { + auto input_channels = (IsDeconv ? problem_size.K : problem_size.C); + auto output_channels = (IsDeconv ? 
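
The `IsDeconv_` parameter added above swaps which problem-size extent plays the role of input versus output channels in the filter iterator's `valid()` test, and the group modes tighten the channel bound. A hedged host-side model (the real iterator also matches a per-iteration group index, which is omitted here; the enum and function are illustrative):

```cpp
enum class GroupModeSketch { kNone, kGrouped, kDepthwise };

inline bool filter_coord_valid(int coord_n, int coord_c, int C, int K,
                               bool is_deconv, GroupModeSketch mode, int channels_per_group) {
  int input_channels  = is_deconv ? K : C;   // deconv reads the filter with roles swapped
  int output_channels = is_deconv ? C : K;

  switch (mode) {
    case GroupModeSketch::kNone:      return coord_n < output_channels && coord_c < input_channels;
    case GroupModeSketch::kDepthwise: return coord_n < output_channels && coord_c < 1;
    default:                          return coord_n < output_channels && coord_c < channels_per_group;
  }
}
```
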
problem_size.C : problem_size.K); + // check alignment constraint on iterator's contiguous dimension - if (problem_size.K % AccessType::kElements) { + if ((input_channels / problem_size.groups) % AccessType::kElements) { return Status::kErrorInvalidProblem; } if (platform::is_same>::value) { - if (problem_size.K % 32) { + if (output_channels % 32) { return Status::kErrorInvalidProblem; } } if (platform::is_same>::value) { - if (problem_size.K % 64) { + if (output_channels % 64) { return Status::kErrorInvalidProblem; } } diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h index 7fb30ad957..a1291aa01c 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h index 7b1012356f..e90d501745 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h index 2f6a1243ba..4c2343c32c 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
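
The reworked `can_implement()` above checks vector alignment against the per-group input-channel count and, for interleaved output layouts, requires the output-channel count to be a multiple of the interleave factor (e.g. 32 or 64). A compact illustrative helper, assuming those are the only two constraints of interest:

```cpp
enum class StatusSketch { kSuccess, kErrorInvalidProblem };

inline StatusSketch filter_can_implement(int C, int K, int groups, bool is_deconv,
                                         int access_elements,
                                         int output_interleave /* 0 if layout is not interleaved */) {
  int input_channels  = is_deconv ? K : C;
  int output_channels = is_deconv ? C : K;

  if ((input_channels / groups) % access_elements) {                  // vectorized contiguous loads
    return StatusSketch::kErrorInvalidProblem;
  }
  if (output_interleave && (output_channels % output_interleave)) {   // interleaved output layout
    return StatusSketch::kErrorInvalidProblem;
  }
  return StatusSketch::kSuccess;
}
```
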
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -67,7 +67,8 @@ template < typename Element_, typename Layout_, typename ThreadMap_, - typename AccessType_ = cutlass::AlignedArray + typename AccessType_ = cutlass::AlignedArray, + bool IsDeconv_ = false > class Conv2dFpropFilterTileAccessIteratorOptimized{ public: @@ -85,6 +86,7 @@ class Conv2dFpropFilterTileAccessIteratorOptimized{ using TensorCoord = typename Layout::TensorCoord; using Index = typename Layout::Index; using LongIndex = typename Layout::LongIndex; + static bool const IsDeconv = IsDeconv_; static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; static int const kConvDim = 2; @@ -145,6 +147,7 @@ class Conv2dFpropFilterTileAccessIteratorOptimized{ uint32_t predicates_[kAccessesPerVector]; int filter_rs_; int filter_c_; + int channels_per_group_; // // Assertions @@ -175,10 +178,11 @@ class Conv2dFpropFilterTileAccessIteratorOptimized{ filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); Index column = threadblock_offset.column() + thread_coord.strided(); + channels_per_group_ = (IsDeconv ? problem_size_.K : problem_size_.C) / problem_size_.groups; CUTLASS_PRAGMA_UNROLL for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { - uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < problem_size_.K) ? 1u : 0); + uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < (IsDeconv ? problem_size_.C : problem_size_.K)) ? 1u : 0); CUTLASS_PRAGMA_UNROLL for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { @@ -188,7 +192,7 @@ class Conv2dFpropFilterTileAccessIteratorOptimized{ CUTLASS_PRAGMA_UNROLL for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { - clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= problem_size_.C); + clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= channels_per_group_); } pointer_ += ( @@ -229,7 +233,7 @@ class Conv2dFpropFilterTileAccessIteratorOptimized{ CUTLASS_PRAGMA_UNROLL for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { - clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= problem_size_.C); + clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= channels_per_group_); } pointer_ += next; @@ -285,19 +289,22 @@ class Conv2dFpropFilterTileAccessIteratorOptimized{ CUTLASS_HOST_DEVICE static Status can_implement(Conv2dProblemSize const &problem_size) { + auto input_channels = (IsDeconv ? problem_size.K : problem_size.C); + auto output_channels = (IsDeconv ? 
problem_size.C : problem_size.K); + // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % AccessType::kElements) { + if ((input_channels / problem_size.groups) % AccessType::kElements) { return Status::kErrorInvalidProblem; } if (platform::is_same>::value) { - if (problem_size.K % 32) { + if (output_channels % 32) { return Status::kErrorInvalidProblem; } } if (platform::is_same>::value) { - if (problem_size.K % 64) { + if (output_channels % 64) { return Status::kErrorInvalidProblem; } } diff --git a/include/cutlass/conv/threadblock/conv2d_params.h b/include/cutlass/conv/threadblock/conv2d_params.h index 1ba9532cea..d34bc9faf1 100644 --- a/include/cutlass/conv/threadblock/conv2d_params.h +++ b/include/cutlass/conv/threadblock/conv2d_params.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -554,20 +554,20 @@ struct Conv2dDgradOutputGradientIteratorOptimizedParams { // next S inc_next[0] = conv_sign * ( - layout.stride()[0] * problem_size.dilation_w + (int64_t)layout.stride()[0] * problem_size.dilation_w ) * element_size_bits / 8; // next R inc_next[1] = conv_sign * ( - layout.stride()[1] * problem_size.dilation_h - - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + (int64_t)layout.stride()[1] * problem_size.dilation_h + - (problem_size.S - 1) * (int64_t)layout.stride()[0] * problem_size.dilation_w ) * element_size_bits / 8; // next K inc_next[2] = ( threadblock_shape.column() * problem_size.split_k_slices - - conv_sign * (problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h - - conv_sign * (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + - conv_sign * (problem_size.R - 1) * (int64_t)layout.stride()[1] * problem_size.dilation_h + - conv_sign * (problem_size.S - 1) * (int64_t)layout.stride()[0] * problem_size.dilation_w ) * element_size_bits / 8; // logical offset added to internal channel counter - units are elements, not bytes @@ -614,12 +614,12 @@ struct Conv2dStridedDgradOutputGradientIteratorOptimizedParams { // next S inc_next[0] = conv_sign * ( - layout.stride()[0] * problem_size.dilation_w + (int64_t)layout.stride()[0] * problem_size.dilation_w ) * element_size_bits / 8; // next R inc_next[1] = conv_sign * ( - layout.stride()[1] * problem_size.dilation_h + (int64_t)layout.stride()[1] * problem_size.dilation_h ) * element_size_bits / 8; // next K @@ -670,18 +670,18 @@ struct Conv2dDgradFilterIteratorOptimizedParams { TRACE_CONV_INITIALIZERS("conv2d_dgrad", "filter", element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); - inc_next_strided = (layout.stride()[2] * threadmap_delta.strided() * element_size_bits) / 8; + inc_next_strided = ((int64_t)layout.stride()[2] * threadmap_delta.strided() * element_size_bits) / 8; inc_next_rs = - ( layout.stride()[0] - - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + ( (int64_t)layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2] ) * element_size_bits / 8; inc_next_k = ( - threadblock_shape.row() * problem_size.split_k_slices * layout.stride()[2] - - (problem_size.R 
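
The `(int64_t)` casts introduced in `conv2d_params.h` guard the byte-increment math against 32-bit overflow: the layout strides are 32-bit, and their products with dilation, thread-map deltas, and the element size can exceed `INT32_MAX` for large tensors. A small standalone check with illustrative numbers:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  int32_t stride = 512 * 1024 * 1024;   // e.g. an outer NHWC stride of a very large tensor
  int     bits   = 16;                  // element size in bits (half_t)

  // Promote to 64-bit before multiplying, as the patch does with (int64_t)layout.stride()[i].
  int64_t product = (int64_t)stride * bits;        // 8,589,934,592
  int64_t inc     = product / 8;                   // byte increment

  // The same product computed in 32-bit arithmetic would exceed INT32_MAX and wrap.
  std::printf("product=%lld (INT32_MAX=%d), byte increment=%lld\n",
              (long long)product, INT32_MAX, (long long)inc);
  return 0;
}
```
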
* problem_size.S - 1) * layout.stride()[0] - - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + threadblock_shape.row() * problem_size.split_k_slices * (int64_t)layout.stride()[2] + - (problem_size.R * problem_size.S - 1) * (int64_t)layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2] ) * element_size_bits / 8; filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices; @@ -730,26 +730,26 @@ struct Conv2dStridedDgradFilterIteratorOptimizedParams { // next S inc_next[0] = - ( layout.stride()[0] * problem_size.stride_w + ( (int64_t)layout.stride()[0] * problem_size.stride_w //- (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] ) * element_size_bits / 8; // next R inc_next[1] = - ( layout.stride()[1] * problem_size.stride_h + ( (int64_t)layout.stride()[1] * problem_size.stride_h //- (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] ) * element_size_bits / 8; // next K inc_next[2] = ( - threadblock_shape.row() * problem_size.split_k_slices * layout.stride()[2] + threadblock_shape.row() * problem_size.split_k_slices * (int64_t)layout.stride()[2] //- (problem_size.R * problem_size.S - 1) * layout.stride()[0] //- (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] ) * element_size_bits / 8; // offset in units of bytes to move the pointer in backward direction - reset_bytes = (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + reset_bytes = (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2] * element_size_bits / 8; filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices; @@ -800,13 +800,13 @@ struct Conv2dWgradOutputGradientIteratorOptimizedParams { element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); // Incremental offsets in unites of bytes (number of elements) * sizeof_bits::value / 8 - offset_next_strided = (threadmap_delta.strided() * layout.stride()[0]) + offset_next_strided = (threadmap_delta.strided() * (int64_t)layout.stride()[0]) * element_size_bits / 8; offset_next_contiguous = (threadmap_delta.contiguous()) * element_size_bits / 8; - inc_next_npq = (threadblock_shape.column() * problem_size.split_k_slices * layout.stride()[0]) + inc_next_npq = (threadblock_shape.column() * problem_size.split_k_slices * (int64_t)layout.stride()[0]) * element_size_bits / 8; } }; @@ -891,4 +891,3 @@ struct PredicatedScaleBiasVectorAccessIteratorParams { } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h index 66dd75d26f..17f4594ba5 100644 --- a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h +++ b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -104,6 +104,11 @@ class TileIterator { return TileAccessIterator::getParams(problem_size, layout); } + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + tile_access_iterator_.set_iteration_index(index); + } /// Adds a pointer offset in units of Element CUTLASS_HOST_DEVICE diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h index ec68cc89a5..3e3a4f155d 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -195,7 +195,7 @@ class Conv2dWgradActivationTileAccessIteratorAnalytic { s = filter_s_[iteration_contiguous_]; } else { - /// Multiple access to support non-128b alignment in contiguous dimenstion + /// Multiple access to support non-128b alignment in contiguous dimension c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) % problem_size_.C; int wrap_c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) / problem_size_.C; s = (filter_s_[iteration_contiguous_] + wrap_c) % problem_size_.S; @@ -268,7 +268,7 @@ class Conv2dWgradActivationTileAccessIteratorAnalytic { static Status can_implement(Conv2dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.K % AccessType::kElements) { + if (problem_size.C % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h index fec9dcda1c..8cbcc3d9fb 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -212,7 +212,7 @@ class Conv2dWgradActivationTileAccessIteratorOptimized { if (kAccessesPerVector > 1) { // This code section is only to support non-128b alignment - // Multiple access to support non-128b alignment in contiguous dimenstion + // Multiple access to support non-128b alignment in contiguous dimension int wrap_c; params_.c_divmod(wrap_c, c, c + iteration_vector_ * AccessType::kElements); @@ -304,7 +304,7 @@ class Conv2dWgradActivationTileAccessIteratorOptimized { static Status can_implement(Conv2dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.K % AccessType::kElements) { + if (problem_size.C % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h index 0a26d646c7..793649dbea 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -243,7 +243,7 @@ class Conv2dWgradOutputGradientTileAccessIteratorAnalytic { static Status can_implement(Conv2dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % AccessType::kElements) { + if (problem_size.K % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h index aac0e3c3c1..07233d8924 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
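
The swapped alignment checks for the wgrad iterators above reflect which tensor each operand reads: the output-gradient iterator is contiguous (and vectorized) along K, while the activation iterator is contiguous along C, so each must test its own extent. A tiny illustrative helper, not CUTLASS API:

```cpp
// Both operands of the implicit-GEMM wgrad must be vector-aligned along
// their own contiguous channel dimension.
inline bool wgrad_iterators_aligned(int C, int K, int access_elements) {
  bool output_gradient_ok = (K % access_elements) == 0;  // operand A, contiguous in K
  bool activation_ok      = (C % access_elements) == 0;  // operand B, contiguous in C
  return output_gradient_ok && activation_ok;
}
```
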
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -293,7 +293,7 @@ class Conv2dWgradOutputGradientTileAccessIteratorOptimized { static Status can_implement(Conv2dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % AccessType::kElements) { + if (problem_size.K % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h index 331132c759..943ab88cfc 100644 --- a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -251,7 +251,7 @@ class Conv3dDgradFilterTileAccessIteratorAnalytic { static Status can_implement(Conv3dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % (128/sizeof_bits::value)) { + if (problem_size.C % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h index a5a760db23..2d5837dd3d 100644 --- a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -272,7 +272,7 @@ class Conv3dDgradFilterTileAccessIteratorOptimized { static Status can_implement(Conv3dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % (128/sizeof_bits::value)) { + if (problem_size.C % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h index 0fc623e2ef..30b7f2fcf6 100644 --- a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -325,7 +325,7 @@ class Conv3dDgradOutputGradientTileAccessIteratorAnalytic < static Status can_implement(ConvProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.K % (128/sizeof_bits::value)) { + if (problem_size.K % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h index 3f6b36e9dd..5a53c8cbd5 100644 --- a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -466,7 +466,7 @@ class Conv3dDgradOutputGradientTileAccessIteratorOptimized { } // check alignment constraint on iterator's contiguous dimension - if (problem_size.K % (128/sizeof_bits::value)) { + if (problem_size.K % AccessType::kElements) { return Status::kErrorNotSupported; } diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h index a192bfd4be..f0f9a86a34 100644 --- a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -272,7 +272,7 @@ class Conv3dFpropActivationTileAccessIteratorAnalytic { static Status can_implement(ConvProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % (128/sizeof_bits::value)) { + if (problem_size.C % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h index d0e89bd68b..78b270eb9a 100644 --- a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -455,7 +455,7 @@ class Conv3dFpropActivationTileAccessIteratorOptimized { static Status can_implement(Conv3dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % (128/sizeof_bits::value)) { + if (problem_size.C % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h index 8b388b0e8c..9f04adc40b 100644 --- a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -64,7 +64,8 @@ namespace threadblock { template < typename Shape_, typename Element_, - typename ThreadMap_ + typename ThreadMap_, + bool IsDeconv_ = false > class Conv3dFpropFilterTileAccessIteratorAnalytic { public: @@ -82,6 +83,7 @@ class Conv3dFpropFilterTileAccessIteratorAnalytic { using TensorCoord = typename Layout::TensorCoord; using Index = typename Layout::Index; using LongIndex = typename Layout::LongIndex; + static bool const IsDeconv = IsDeconv_; static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; static int const kConvDim = 3; @@ -198,8 +200,11 @@ class Conv3dFpropFilterTileAccessIteratorAnalytic { TensorCoord coord = at(); - return coord.n() < problem_size_.K && - coord.c() < problem_size_.C; + auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C); + auto output_channels = (IsDeconv ? problem_size_.C : problem_size_.K); + + return coord.n() < output_channels && + coord.c() < input_channels; } /// Returns a pointer to the vector starting at the current coordinate @@ -233,9 +238,10 @@ class Conv3dFpropFilterTileAccessIteratorAnalytic { /// Determines whether the Implicit GEMM can execute the given problem. CUTLASS_HOST_DEVICE static Status can_implement(ConvProblemSize const &problem_size) { - + auto input_channels = (IsDeconv ? problem_size.K : problem_size.C); + auto output_channels = (IsDeconv ? 
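
The 3-D iterators above now validate alignment against `AccessType::kElements` rather than a hard-coded 128-bit access, so problem sizes rejected under the old check become admissible when the iterator is instantiated with narrower accesses. A quick standalone comparison with illustrative values:

```cpp
#include <cstdio>

int main() {
  int const element_bits    = 16;   // half_t
  int const access_elements = 4;    // AccessType::kElements, i.e. 64-bit loads
  int const C               = 36;   // channel count to validate

  bool old_ok = (C % (128 / element_bits)) == 0;  // requires C % 8 == 0 -> rejected
  bool new_ok = (C % access_elements) == 0;       // requires C % 4 == 0 -> accepted
  std::printf("old_ok=%d new_ok=%d\n", old_ok, new_ok);
  return 0;
}
```
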
problem_size.C : problem_size.K); // check alignment constraint on iterator's contiguous dimension - if (problem_size.K % (128/sizeof_bits::value)) { + if (input_channels % AccessType::kElements) { return Status::kErrorInvalidProblem; } return Status::kSuccess; diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h index 9b69dbcb43..efe34497f5 100644 --- a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -66,7 +66,8 @@ template < typename Shape_, typename Element_, typename Layout_, - typename ThreadMap_ + typename ThreadMap_, + bool IsDeconv_ = false > class Conv3dFpropFilterTileAccessIteratorOptimized{ public: @@ -84,6 +85,7 @@ class Conv3dFpropFilterTileAccessIteratorOptimized{ using TensorCoord = typename Layout::TensorCoord; using Index = typename Layout::Index; using LongIndex = typename Layout::LongIndex; + static bool const IsDeconv = IsDeconv_; static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; static int const kConvDim = 3; @@ -172,11 +174,11 @@ class Conv3dFpropFilterTileAccessIteratorOptimized{ CUTLASS_PRAGMA_UNROLL for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { - uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < problem_size_.K) ? 1u : 0); + uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < (IsDeconv ? problem_size_.C : problem_size_.K)) ? 1u : 0); predicates_ |= (pred << s); } - if (filter_c_ >= problem_size.C) { + if (filter_c_ >= (IsDeconv ? problem_size_.K : problem_size_.C)) { predicates_ = 0u; } @@ -214,7 +216,7 @@ class Conv3dFpropFilterTileAccessIteratorOptimized{ filter_c_ += params_.filter_c_delta; } - if (filter_c_ >= problem_size_.C) { + if (filter_c_ >= (IsDeconv ? problem_size_.K : problem_size_.C)) { predicates_ = 0; } @@ -258,12 +260,12 @@ class Conv3dFpropFilterTileAccessIteratorOptimized{ /// Determines whether the Implicit GEMM can execute the given problem. CUTLASS_HOST_DEVICE static Status can_implement(Conv3dProblemSize const &problem_size) { + auto input_channels = (IsDeconv ? problem_size.K : problem_size.C); // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % (128/sizeof_bits::value)) { + if (input_channels % AccessType::kElements) { return Status::kErrorInvalidProblem; } - return Status::kSuccess; } }; diff --git a/include/cutlass/conv/threadblock/conv3d_params.h b/include/cutlass/conv/threadblock/conv3d_params.h index 5ad1e4fa3d..ac422b8f05 100644 --- a/include/cutlass/conv/threadblock/conv3d_params.h +++ b/include/cutlass/conv/threadblock/conv3d_params.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -304,8 +304,8 @@ struct Conv3dDgradOutputGradientIteratorOptimizedParams { // logical offset added to internal channel counter - units are elements, not bytes filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices; } - }; + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Parameters object for Conv2d DGRAD Filter (w) iterator @@ -343,18 +343,18 @@ struct Conv3dDgradFilterIteratorOptimizedParams { TRACE_CONV_INITIALIZERS("conv3d_dgrad", "filter", element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); - inc_next_strided = (layout.stride()[3] * threadmap_delta.strided() * element_size_bits) / 8; + inc_next_strided = ((int64_t)layout.stride()[3] * threadmap_delta.strided() * element_size_bits) / 8; inc_next_trs = - ( layout.stride()[0] - - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[3] + ( (int64_t)layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[3] ) * element_size_bits / 8; inc_next_k = ( - threadblock_shape.row() * problem_size.split_k_slices * layout.stride()[3] - - (problem_size.T * problem_size.R * problem_size.S - 1) * layout.stride()[0] - - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[3] + threadblock_shape.row() * problem_size.split_k_slices * (int64_t)layout.stride()[3] + - (problem_size.T * problem_size.R * problem_size.S - 1) * (int64_t)layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[3] ) * element_size_bits / 8; filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices; @@ -408,13 +408,13 @@ struct Conv3dWgradOutputGradientIteratorOptimizedParams { element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); // Incremental offsets in unites of bytes (number of elements) * element_size_bits / 8 - offset_next_strided = (threadmap_delta.strided() * layout.stride()[0]) + offset_next_strided = (threadmap_delta.strided() * (int64_t)layout.stride()[0]) * element_size_bits / 8; offset_next_contiguous = (threadmap_delta.contiguous()) * element_size_bits / 8; - inc_next_nzpq = (threadblock_shape.column() * problem_size.split_k_slices * layout.stride()[0]) + inc_next_nzpq = (threadblock_shape.column() * problem_size.split_k_slices * (int64_t)layout.stride()[0]) * element_size_bits / 8; // Precompute several quantities for fast modulo arithmetic. diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h index ebf7e84ed8..cc8faea701 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -270,7 +270,7 @@ class Conv3dWgradActivationTileAccessIteratorAnalytic { static Status can_implement(Conv3dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.K % (128/sizeof_bits::value)) { + if (problem_size.C % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h index 0413c7dc75..2b10d207fa 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -250,7 +250,7 @@ class Conv3dWgradActivationTileAccessIteratorOptimized { fast_divmod(p, q, residual, problem_size_.Q, params_.q_mul, params_.q_shr); int d = z * problem_size_.stride_d + precomputed_filter_t_[iteration_contiguous_]; - int h = p * problem_size_.stride_h + precomputed_filter_r_[iteration_contiguous_];; + int h = p * problem_size_.stride_h + precomputed_filter_r_[iteration_contiguous_]; int w = q * problem_size_.stride_w + precomputed_filter_s_[iteration_contiguous_]; return TensorCoord(n, d, h, w, filter_c_[iteration_contiguous_]); @@ -300,7 +300,7 @@ class Conv3dWgradActivationTileAccessIteratorOptimized { static Status can_implement(Conv3dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.K % (128/sizeof_bits::value)) { + if (problem_size.C % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h index 4d57f0ba48..be9d4fb7ac 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -248,7 +248,7 @@ class Conv3dWgradOutputGradientTileAccessIteratorAnalytic { static Status can_implement(Conv3dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % (128/sizeof_bits::value)) { + if (problem_size.K % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h index 2bc5971bd6..0ef145f19d 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -291,7 +291,7 @@ class Conv3dWgradOutputGradientTileAccessIteratorOptimized { static Status can_implement(Conv3dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (problem_size.C % (128/sizeof_bits::value)) { + if (problem_size.K % AccessType::kElements) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h b/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h new file mode 100644 index 0000000000..8023183499 --- /dev/null +++ b/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h @@ -0,0 +1,230 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Extracts the host-params objects into non-template code. +*/ + +#pragma once + +#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0 + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED +#include +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized +template +struct Depthwise2dFpropDirectConvParams; + +/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation +template +struct Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams; + +/// Parameters structure used for DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized +template +struct Depthwise2dFpropDirectConvFilterIteratorParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized +template<> +struct Depthwise2dFpropDirectConvParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int32_t activation_tile_h; + int32_t activation_tile_w; + int32_t activation_tile_hw; + FastDivmod activation_tile_w_divmod; + + int filter[2]; + int stride[2]; + int dilation[2]; + int inc_next[2]; + FastDivmod pq_divmod; + FastDivmod q_divmod; + + int activation_load_count; + int activation_storage_elements; + int activation_size; + // + // Methods + // + + CUTLASS_HOST_DEVICE + Depthwise2dFpropDirectConvParams() { } + + CUTLASS_HOST_DEVICE + Depthwise2dFpropDirectConvParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< layout object + MatrixCoord threadblock_shape, ///< CTA threadblock Shape + Layout::TensorCoord threadblock_output_shape, ///< Output tile Shape per threadblock + const int element_size_bits, ///< bits of activation element + const int thread_count, ///< threads per threadblock + const int thread_count_contiguous, ///< number of threads for continuous dimension + const int element_per_load) ///< element per each load + : layout(layout) { + + filter[0] = problem_size.S; + filter[1] = problem_size.R; + + stride[0] = problem_size.stride_w; + stride[1] = problem_size.stride_h; + + dilation[0] = problem_size.dilation_w; + dilation[1] = problem_size.dilation_h; + + // Compute activation_tile size per threadblock because stride and dilation are runtime params. 
+ activation_tile_h = (threadblock_output_shape.h() - 1) * problem_size.stride_h + + (problem_size.R - 1) * problem_size.dilation_h + 1; + activation_tile_w = (threadblock_output_shape.w() - 1) * problem_size.stride_w + + (problem_size.S - 1) * problem_size.dilation_w + 1; + activation_tile_hw = activation_tile_h * activation_tile_w; + + activation_tile_w_divmod = FastDivmod(activation_tile_w); + + /// Below two values could not be templatized because the stride and dilation are runtime params + activation_load_count = (thread_count_contiguous * activation_tile_hw + (thread_count - 1)) / thread_count; + activation_storage_elements = activation_load_count * element_per_load * thread_count; + activation_size = activation_storage_elements * element_size_bits / 8; + + // Fastdivmod for output P, Q + int tiles_p = + (problem_size.P + (threadblock_output_shape.h() - 1)) / (threadblock_output_shape.h()); + int tiles_q = (problem_size.Q + (threadblock_output_shape.w() - 1)) / + (threadblock_output_shape.w()); + + pq_divmod = FastDivmod(tiles_p * tiles_q); + q_divmod = FastDivmod(tiles_q); + + // next S + inc_next[0] = problem_size.dilation_w; + // next R + inc_next[1] = (activation_tile_w * problem_size.dilation_h - (problem_size.S - 1) * problem_size.dilation_w); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation +template <> +struct Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams { + using Layout = layout::TensorNHWC; + + Layout layout; + + FastDivmod pq_divmod; + FastDivmod q_divmod; + + int activation_size; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams() {} + + CUTLASS_HOST_DEVICE + Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< Layout object + MatrixCoord threadblock_shape, ///< Threadblock Shape + Layout::TensorCoord threadblock_output_shape, ///< Output tile Shape per threadblock + const int activation_size_ ///< Activation size loaded by iterator + ) + : layout(layout), + activation_size(activation_size_) { + // Fastdivmod for output P, Q + int tiles_p = + (problem_size.P + (threadblock_output_shape.h() - 1)) / (threadblock_output_shape.h()); + int tiles_q = + (problem_size.Q + (threadblock_output_shape.w() - 1)) / (threadblock_output_shape.w()); + + pq_divmod = FastDivmod(tiles_p * tiles_q); + q_divmod = FastDivmod(tiles_q); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized +template <> +struct Depthwise2dFpropDirectConvFilterIteratorParams { + using Layout = layout::TensorNHWC; + + Layout layout; + + int filter_size; + + bool is_convolution; + // + // Methods + // + + CUTLASS_HOST_DEVICE + Depthwise2dFpropDirectConvFilterIteratorParams() {} + + CUTLASS_HOST_DEVICE + Depthwise2dFpropDirectConvFilterIteratorParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< Layout object + MatrixCoord threadblock_shape, ///< Threadblock Shape + const int filter_size_) ///< Filter size loaded by iterator + : layout(layout), + filter_size(filter_size_), + is_convolution(problem_size.mode == Mode::kConvolution){} +}; + +} // namespace threadblock +} // namespace conv +} // 
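
The activation tile computed above is the input footprint of one threadblock's output tile: `(out - 1) * stride + (filter - 1) * dilation + 1` per spatial dimension, evaluated at runtime because stride and dilation are runtime parameters for this params struct. A small numeric check with made-up shapes:

```cpp
#include <cstdio>

int main() {
  int out_h = 8, out_w = 8;         // threadblock output tile
  int R = 3, S = 3;                 // filter extent
  int stride_h = 2, stride_w = 2;
  int dilation_h = 1, dilation_w = 1;

  int tile_h = (out_h - 1) * stride_h + (R - 1) * dilation_h + 1;  // 17
  int tile_w = (out_w - 1) * stride_w + (S - 1) * dilation_w + 1;  // 17
  std::printf("activation tile: %d x %d (%d elements per channel)\n",
              tile_h, tile_w, tile_h * tile_w);
  return 0;
}
```
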
namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h b/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h new file mode 100644 index 0000000000..192d961051 --- /dev/null +++ b/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h @@ -0,0 +1,314 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. 
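+
+    Stride, dilation and the resulting activation tile shape are compile-time template
+    parameters (StrideShape, DilationShape, ActivationShape), so the per-thread load count is a
+    static constant; can_implement() rejects problems whose runtime stride or dilation differ
+    from the templated values.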
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/threadblock/depthwise_direct_conv_params.h" +#include "cutlass/coord.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template > +class DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation { + public: + // + // Types + // + + using Shape = Shape_; + using OutputTileShape = OutputTileShape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + // Compilation value of stride , dialtion and activation shape + using StrideShape = StrideShape_; + using DilationShape = DilationShape_; + using ActivationShape = ActivationShape_; + + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + static int const kActivationSize = ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess * ThreadMap::kThreads * + sizeof_bits::value / 8; + + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, "Require Iterations::kContiguous == 1"); + + static_assert(OutputTileShape::kN == 1, "Require OutputTileShape::kN == 1"); + static_assert(OutputTileShape::kC == Shape::kColumn, "Require OutputTile shape == channels per threadblock"); + + // + // Parameters structure + // + + using Params = Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams; + + private: + Conv2dProblemSize const &problem_size_; + Params const ¶ms_; + char const *pointer_; + + // Base channels for current threadblock + int base_c_; + // Base activation index for current threadblock + int offset_intial_npq_; + // Base activation coord for current threadblock + TensorCoord activatioin_base_; + // Intial thread positioin + int offset_initial_hwc_; + // Overall load instruction per thread. + int iterator_load_; + // thread loading position. 
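+  // iterator_hwc_ linearizes the thread position as
+  // ((h * ActivationShape::kW + w) * ThreadMap::Detail::ShapeVec::kContiguous + c), so at()
+  // recovers (h, w) and the channel vector index with integer div/mod.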
+ int iterator_hwc_; + // activation N is inside the Tensor or not + bool valid_n_; + + public: + + + CUTLASS_HOST_DEVICE + DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = + MatrixCoord() + ) + : params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + offset_intial_npq_(threadblock_offset.row()), + offset_initial_hwc_(thread_idx), + iterator_load_(0) { + + base_c_ = threadblock_offset.column(); + + set_iteration_index(0); + + set_activation_coord(offset_intial_npq_); + + } + + CUTLASS_HOST_DEVICE + void set_activation_coord(int offset_npq) { + int offset_inital_n, offset_inital_p, offset_inital_q; + int residual; + + params_.pq_divmod(offset_inital_n, residual, offset_npq); + params_.q_divmod(offset_inital_p, offset_inital_q, residual); + + int base_n = offset_inital_n; + + int base_h = + offset_inital_p * OutputTileShape::kH * StrideShape::kRow - problem_size_.pad_h; + + int base_w = + offset_inital_q * OutputTileShape::kW * StrideShape::kColumn - problem_size_.pad_w; + + activatioin_base_ = TensorCoord(base_n, base_h, base_w, base_c_); + + valid_n_ = activatioin_base_.n() < problem_size_.N; + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params( + problem_size, + layout, + {Shape::kRow, Shape::kColumn}, + {OutputTileShape::kN, OutputTileShape::kH, OutputTileShape::kW, OutputTileShape::kC}, + kActivationSize); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iterator_hwc_ = offset_initial_hwc_ + index * ThreadMap::kThreads; + iterator_load_ = index; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // Go to next threadblock + offset_intial_npq_ += problem_size_.split_k_slices; + + set_iteration_index(0); + + set_activation_coord(offset_intial_npq_); + } + + /// Returns the coordinate in the activations tensor X that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + int c = iterator_hwc_ % ThreadMap::Detail::ShapeVec::kContiguous ; + int next = iterator_hwc_ / ThreadMap::Detail::ShapeVec::kContiguous ; + int h = next / ActivationShape::kW; + int w = next % ActivationShape::kW; + + c = c * AccessType::kElements; + + return activatioin_base_ + TensorCoord(0, h, w, c); + } + + /// Returns true if the current coordinate is within the activations tensor X + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + bool valid_c = coord.c() < problem_size_.C; + bool valid_h = coord.h() >= 0 && coord.h() < problem_size_.H; + bool valid_w = coord.w() >= 0 && coord.w() < problem_size_.W; + return valid_n_ ? 
valid_c & valid_h & valid_w : 0; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + AccessType const *ptr = + reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + return ptr; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation &operator++() { + + ++iterator_load_; + iterator_hwc_ += ThreadMap::kThreads; + + if (iterator_load_ < ThreadMap::Iterations::kCount) { + return *this; + } + + iterator_load_ = 0; + iterator_hwc_ = offset_initial_hwc_; + + return *this; + } + + /// Determines the activation size loaded by iterator + CUTLASS_HOST_DEVICE + int get_load_size() { + return kActivationSize; + } + + /// Determines the iterations needed + CUTLASS_HOST_DEVICE + int get_iteration_num() { + return ThreadMap::Iterations::kCount; + } + + /// Determines whether the Depthwise fprop can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check stride and dilation constraint + if (problem_size.stride_h != StrideShape::kRow || problem_size.stride_w != StrideShape::kColumn) { + return Status::kErrorInvalidProblem; + } + + if (problem_size.dilation_h != DilationShape::kRow || problem_size.dilation_w != DilationShape::kColumn) { + return Status::kErrorInvalidProblem; + } + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h b/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h new file mode 100644 index 0000000000..a858a23f9f --- /dev/null +++ b/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h @@ -0,0 +1,291 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/threadblock/depthwise_direct_conv_params.h" +#include "cutlass/coord.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template > +class DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized { + public: + // + // Types + // + + using Shape = Shape_; + using OutputTileShape = OutputTileShape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, "Require Iterations::kContiguous == 1"); + + static_assert(OutputTileShape::kN == 1, "Require OutputTileShape::kN == 1"); + static_assert(OutputTileShape::kC == Shape::kColumn, "Require OutputTile shape == channels per threadblock"); + + // + // Parameters structure + // + + using Params = Depthwise2dFpropDirectConvParams; + + private: + Conv2dProblemSize const &problem_size_; + Params const ¶ms_; + char const *pointer_; + + // Base channels for current threadblock + int base_c_; + // Base activation index for current threadblock + int offset_intial_npq_; + 
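+  // (This npq index is linear over (n, tile_p, tile_q); the pq/q fast divmods held in Params
+  // unpack it back into coordinates inside set_activation_coord().)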
// Base activation coord for current threadblock + TensorCoord activatioin_base_; + // Intial thread positioin + int offset_initial_hwc_; + // Overall load instruction per thread. + int iterator_load_; + // thread loading position. + int iterator_hwc_; + // Number of loads for activations tensor X. + const int number_of_loads_; + + public: + + + CUTLASS_HOST_DEVICE + DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = + MatrixCoord() + ) + : params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + offset_intial_npq_(threadblock_offset.row()), + offset_initial_hwc_(thread_idx), + iterator_load_(0), + number_of_loads_(params.activation_load_count) { + + base_c_ = threadblock_offset.column(); + + set_activation_coord(offset_intial_npq_); + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + void set_activation_coord(int offset_npq) { + int offset_inital_n, offset_inital_p, offset_inital_q; + int residual; + + params_.pq_divmod(offset_inital_n, residual, offset_npq); + params_.q_divmod(offset_inital_p, offset_inital_q, residual); + + int base_n = offset_inital_n; + + int base_h = + offset_inital_p * OutputTileShape::kH * problem_size_.stride_h - problem_size_.pad_h; + + int base_w = + offset_inital_q * OutputTileShape::kW * problem_size_.stride_w - problem_size_.pad_w; + + activatioin_base_ = TensorCoord(base_n, base_h, base_w, base_c_); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params( + problem_size, + layout, + {Shape::kRow, Shape::kColumn}, + {OutputTileShape::kN, OutputTileShape::kH, OutputTileShape::kW, OutputTileShape::kC}, + sizeof_bits::value, + ThreadMap::kThreads, + ThreadMap::Detail::ShapeVec::kContiguous, + ThreadMap::kElementsPerAccess); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iterator_hwc_ = offset_initial_hwc_ + index * ThreadMap::kThreads; + iterator_load_ = index; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // Go to next threadblock + offset_intial_npq_ += problem_size_.split_k_slices; + + set_activation_coord(offset_intial_npq_); + } + + /// Returns the coordinate in the activations tensor X that is currently pointed to + /// by the iterator. 
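+  /// The (h, w) offset inside the activation tile is recovered from the linear thread position
+  /// via the precomputed activation_tile_w_divmod.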
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int c = iterator_hwc_ % ThreadMap::Detail::ShapeVec::kContiguous ; + int next = iterator_hwc_ / ThreadMap::Detail::ShapeVec::kContiguous ; + int h, w; + params_.activation_tile_w_divmod(h, w, next) ; + + c = c * AccessType::kElements; + + return activatioin_base_ + TensorCoord(0, h, w, c); + } + + /// Returns true if the current coordinate is within the activations tensor X + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + AccessType const *ptr = + reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + return ptr; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized &operator++() { + + ++iterator_load_; + iterator_hwc_ += ThreadMap::kThreads; + + if (iterator_load_ < number_of_loads_) { + return *this; + } + + iterator_load_ = 0; + iterator_hwc_ = offset_initial_hwc_; + + return *this; + } + + /// Determines the activation size loaded by iterator + CUTLASS_HOST_DEVICE + int get_load_size() { + return params_.activation_size; + } + + /// Determines the iterations needed + CUTLASS_HOST_DEVICE + int get_iteration_num() { + return number_of_loads_; + } + + /// Determines whether the Depthwise fprop can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h b/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h new file mode 100644 index 0000000000..50aeee006d --- /dev/null +++ b/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h @@ -0,0 +1,551 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/conv/threadblock/depthwise_mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Epilogue stores the data into global memory + typename Epilogue_, + /// iterator implementation variants + conv::IteratorAlgorithm IteratorAlgorithm_ = conv::IteratorAlgorithm::kOptimized, + /// Used for partial specialization + typename Enable = bool> +class DepthwiseFpropDirectConvMultipleStage : + public DepthwiseDirectConvMmaBase { +public: + ///< Base class + using Base = DepthwiseDirectConvMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Policy describing tuning details + using Policy = Policy_; + + using Epilogue = Epilogue_; + + using SmemIteratorA = SmemIteratorA_; + 
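+  ///< The filter (operand B) tile is written to shared memory only during the prologue and then
+  ///< stays resident; only operand A cycles through the multistage buffer.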
using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + static conv::IteratorAlgorithm const kItertorAlgorithm = IteratorAlgorithm_; + + // + // Dependent types + // + + /// Fragment of accumulator tile + + using ElementC = typename Policy::Operator::ElementC; + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Internal structure exposed for introspection. + struct Detail { + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + DepthwiseFpropDirectConvMultipleStage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA &iterator_A, + IteratorB &iterator_B, + int group_start_A = 0, + int group_start_B = 0) { + if (kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation) { + // Number of iterators is a static value. 
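+      // With fixed stride/dilation the activation footprint, and therefore
+      // Detail::AsyncCopyIterationsPerStageA, is known at compile time, so the copy loop below
+      // can be fully unrolled.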
+ iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + ++this->smem_iterator_A_; + } + } else { + // Number of iterators is a runtime value. + iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < iterator_A.get_iteration_num(); ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + ++this->smem_iterator_A_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA &iterator_A, + ///< Params of global memory iterator + typename IteratorA::Params const &iterator_a_params, + ///< iterator over B operand in global memory + IteratorB &iterator_B, + ///< Params of global memory iterator + typename IteratorB::Params const &iterator_b_params, + ///< initial value of accumulator + FragmentC const &src_accum, + /// Epilogue + Epilogue &epilogue, + ///< Output operator + typename Epilogue::OutputOp const &output_op, + ///< Tile iterator for destination + typename Epilogue::OutputTileIterator &destination_iterator, + ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + typename Epilogue::OutputTileIterator &source_iterator, + + int split_k_slices = 1 + ) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) { + + if (stage == 0) { + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + } + + if(kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation){ + // Number of iterators is compilation static. 
+ iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + } else { + // Number of iterators is a runtime value. + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_num(iterator_A.get_iteration_num()); + this->smem_iterator_A_.set_iteration_index(0); + + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < iterator_A.get_iteration_num(); ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + // Move to the next stage + iterator_A.advance(); + + this->smem_iterator_A_.add_tile_offset({1, 0}); + + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + } + + ///////////////////////////////////////////////////////////////////////////// + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.setup_initial_status(iterator_a_params); + + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0], + warp_loaded_frag_A[0], warp_loaded_frag_B[0]); + + // + // Mainloop + // + + unsigned int iterations = 0; + constexpr int inner_loop_iterations = round_up(Base::kWarpGemmIterations, 2); + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { // Each iteration is a cta tile. + + accum.clear(); + + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < inner_loop_iterations; ++warp_mma_k) { + if (Base::kWarpGemmIterations % 2 == 0 || warp_mma_k + 1 != Base::kWarpGemmIterations) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
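+        // Warp fragments are double buffered: fragment (warp_mma_k % 2) feeds the current
+        // warp_mma while fragment ((warp_mma_k + 1) % 2) is loaded for the next iteration.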
+ + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Shape::kK); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Shape::kK); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + } + + if (warp_mma_k > 0) + warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + // Issue global->shared copies for the next stage + int group_start_iteration_A, group_start_iteration_B; + + if (warp_mma_k == 0) { + group_start_iteration_A = 0; + group_start_iteration_B = 0; + copy_tiles_and_advance( + iterator_A, iterator_B, group_start_iteration_A, group_start_iteration_B); + } + + if (warp_mma_k < Base::kWarpGemmIterations) { + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum + ); + } + + if (warp_mma_k + 1 == inner_loop_iterations) + warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + if (warp_mma_k + 2 == inner_loop_iterations) { + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages of cp.async have committed + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next cta + iterator_A.advance(); + + this->smem_iterator_A_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({-Base::kStages, 0}); + + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.advance(- (Base::kStages-1) * iterator_A.get_load_size()); + smem_read_stage_idx = 0; + } else { + this->warp_tile_iterator_A_.advance(iterator_A.get_load_size()); + ++smem_read_stage_idx; + } + + if (kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation) { + this->warp_tile_iterator_A_.setup_initial_status(iterator_a_params); + } + + // goback to start position. B has no multiple stage + this->warp_tile_iterator_B_.add_tile_offset({-Policy::kPartitionsK * Shape::kK, 0}); + + --gemm_k_iterations; + } + } + + // + // Epilogue + // + int32_t smem_base_offset = iterator_B.get_load_size() + (iterations % Base::kStages) * iterator_A.get_load_size(); + + destination_iterator.set_tile_index(iterations * split_k_slices); + + source_iterator.set_tile_index(iterations * split_k_slices); + + epilogue(output_op, destination_iterator, accum, source_iterator, smem_base_offset); + + ++iterations; + } + + // Insert fence and wait for all outstanding cp.async operations to commit. 
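+    // cp_async_wait<0> leaves no cp.async stage in flight before the kernel exits the mainloop.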
+ cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h b/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h new file mode 100644 index 0000000000..52d604e43c --- /dev/null +++ b/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h @@ -0,0 +1,261 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. 
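+
+    The threadblock's filter tile (its channel slice across all R*S filter positions) is small,
+    so it is loaded once and kept resident in shared memory; advance() is a no-op.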
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +template > +class DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized { +public: + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static int const kFilterSize = ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess * ThreadMap::kThreads * + sizeof_bits::value / 8; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + using Params = Depthwise2dFpropDirectConvFilterIteratorParams; + + protected: + + Conv2dProblemSize const &problem_size_; + Params const ¶ms_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + int filter_k_; + int offset_trs_[ThreadMap::Iterations::kStrided]; + +public: + + + + CUTLASS_HOST_DEVICE + DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_k_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_trs_[s] = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout, {Shape::kRow, Shape::kColumn}, kFilterSize); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % 
ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * 8 / sizeof_bits::value; + } + + CUTLASS_HOST_DEVICE + void advance() { + // Do nothing because the filter is persistent in the SMEM + } + + /// Returns the coordinate in the filter tensor W that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int k = filter_k_ + iteration_vector_ * AccessType::kElements; + int trs = offset_trs_[iteration_strided_]; + + return TensorCoord(k, trs, 0 , 0); // As a 2D-matrix + } + + /// Returns true if the current coordinate is within the activations tensor W + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && + coord.h() < Shape::kColumn; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + TensorCoord coord = at(); + int64_t offset = coord.n(); + if (params_.is_convolution) { + offset += (Shape::kColumn - coord.h() - 1)* problem_size_.K; + } else { + offset += coord.h() * problem_size_.K; + } + + return reinterpret_cast(pointer_ + + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines the filter size loaded by iterator + CUTLASS_HOST_DEVICE + int get_load_size() { + return kFilterSize; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + // check whether runtime filter size is same as templated filter size. + if ((problem_size.R * problem_size.S) != Shape::kColumn) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h b/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h new file mode 100644 index 0000000000..c2825fa60d --- /dev/null +++ b/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h @@ -0,0 +1,336 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. 
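+/// This depthwise variant is double buffered (kStages == 2): global->shared copies for the
+/// next k-block are issued while the current one is consumed by warp-level MMAs.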
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_ = NumericArrayConverter< + typename SmemIteratorA_::Element, + typename IteratorA_::Element, + IteratorA_::Fragment::kElements>, + /// + /// Transformation applied to A operand + typename TransformB_ = NumericArrayConverter< + typename SmemIteratorB_::Element, + typename IteratorB_::Element, + IteratorB_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool +> +class DepthwiseFpropPipelined : public gemm::threadblock::MmaBase { +public: + + ///< Base class + using Base = gemm::threadblock::MmaBase; + + using Shape = Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA = IteratorA_; ///< Iterates over tiles of A operand in global memory + using IteratorB = IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + using TransformA = TransformA_; + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2"); + +private: + + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + +protected: + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + DepthwiseFpropPipelined( + typename Base::SharedStorage &shared_storage, ///< Shared storage needed for 
internal use by threadblock-scoped GEMM + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) { + + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC &accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const &src_accum, ///< source accumulator tile + int gemm_k_iterations_per_channel = 0, ///< number of iterations per channel + TransformA transform_A = TransformA(), ///< transformation applied to A fragment + TransformB transform_B = TransformB()) { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentA tb_frag_A; + FragmentB tb_frag_B; + + tb_frag_A.clear(); + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + // Depthwise specific + int channel_start_index = 0; + int rs_plane_idx = 0; + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tightest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + if(rs_plane_idx == gemm_k_iterations_per_channel - 1){ + // Reset interation index. 
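+        // rs_plane_idx counts the mainloop iterations spent on the current channel group's
+        // filter planes; after gemm_k_iterations_per_channel of them the filter iterator
+        // restarts from its first position.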
+ iterator_B.set_iteration_index(0); + } + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. + + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + + // Write fragments to shared memory + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + __syncthreads(); + + if(rs_plane_idx == gemm_k_iterations_per_channel - 1){ + // Move to next set of filter groups. + channel_start_index += Base::kWarpGemmIterations; + } + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A_.set_kgroup_index(channel_start_index + (warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index(channel_start_index + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + } + + warp_mma(accum, warp_frag_A[warp_mma_k % 2], + warp_frag_B[warp_mma_k % 2], accum); + } + + rs_plane_idx = (rs_plane_idx == gemm_k_iterations_per_channel - 1) ? 0: (rs_plane_idx + 1); + + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/depthwise_mma_base.h b/include/cutlass/conv/threadblock/depthwise_mma_base.h new file mode 100644 index 0000000000..967587be05 --- /dev/null +++ b/include/cutlass/conv/threadblock/depthwise_mma_base.h @@ -0,0 +1,229 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a directconv threadblock-scoped Depthwise kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Policy object describing MmaTensorOp +template < + /// Warp-level GEMM operator (concept: gemm::warp::Mma) + typename Operator_, + /// Padding used for A operand in shared memory (concept: MatrixShape) + typename SmemPaddingA_, + /// Padding used for B operand in shared memory (concept: MatrixShape) + typename SmemPaddingB_, + /// + typename ThreadMapA_, + /// + typename ThreadMapB_, + /// Number of partitions of K dimension of GEMM + int PartitionsK = 1> +struct DepthwiseDirectConvMmaPolicy { + /// Warp-level GEMM operator (concept: gemm::warp::MmaTensorOp or gemm::warp::MmaSimt) + using Operator = Operator_; + + /// Padding used for A operand in shared memory + using SmemPaddingA = SmemPaddingA_; + + /// Padding used for B operand in shared memory + using SmemPaddingB = SmemPaddingB_; + + using ThreadMapA = ThreadMapA_; + using ThreadMapB = ThreadMapB_; + + /// Number of partitions of K dimension + static int const kPartitionsK = PartitionsK; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class DepthwiseDirectConvMmaBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. + using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = cutlass::gemm:: + GemmShape; + + /// Number of warp-level GEMM oeprations + /// kWarpGemmIterations could be even and odd. 
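  /// Example (illustrative shapes, not taken from a specific kernel): a SIMT warp tile with
  /// WarpGemm::kK == 9 and Operator::Policy::MmaShape::kK == 1 gives kWarpGemmIterations == 9,
  /// an odd count. Odd counts are permitted here, in contrast to the double-buffered fusion
  /// pipelines elsewhere in this change, which static_assert an even inner-loop count.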
+ static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); + + /// Number of stages + static int const kStages = Stages; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the B operand + using TensorRefB = TensorRef; + + static_assert(kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage { + public: + // + // Type definitions + // + + /// Shape of the A matrix operand in shared memory + using ShapeA = MatrixShape<1, // Not determined at compile-time :( + Shape::kN + Policy::SmemPaddingA::kRow>; + + /// Shape of the B matrix operand in shared memory + using ShapeB = MatrixShape; // Tile N = 64? + + public: + // + // Data members + // + + // Let persistent B matrix in front of dynamic matrix A + /// Buffer for B operand + AlignedBuffer operand_B; + + /// Buffer for A operand + /// Not be determined at compile-time -- Just to get a Smem start address. + AlignedBuffer operand_A; + public: + // + // Methods + // + + /// Returns a layout object for the A matrix + CUTLASS_DEVICE + static typename Operator::LayoutA LayoutA() { + return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn}); + } + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() { + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); + } + + /// Returns a TensorRef to the A operand + CUTLASS_HOST_DEVICE + TensorRefA operand_A_ref() { return TensorRefA{operand_A.data(), LayoutA()}; } + + /// Returns a TensorRef to the B operand + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() { return TensorRefB{operand_B.data(), LayoutB()}; } + }; + + protected: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + DepthwiseDirectConvMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx), + warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h b/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h new file mode 100644 index 0000000000..de84180f38 --- /dev/null +++ b/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h @@ -0,0 +1,952 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data + layout of the global memory fragments, data types, and internal tile sizes. + + Partial specializations for threadblock::Mma operations targeting depthwise related simt instructions. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/warp/mma.h" + +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/warp/mma_depthwise_simt.h" + +#include "cutlass/gemm/threadblock/mma_pipelined.h" +#include "cutlass/gemm/threadblock/mma_singlestage.h" + +#include "cutlass/gemm/threadblock/mma_base.h" +#include "cutlass/conv/threadblock/depthwise_mma_base.h" + +#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h" + +#include "cutlass/arch/cache_operation.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +namespace detail { +// +// Convert a WarpShapeM which is the whole tile of elements into the number of elements (2D) held by +// each partitions within warp. +// The goal is for each thread's tile of elements to be as square as +// possible for performance (4x4 will be faster than 2x8). +template // The number of partitions within the warp +struct SimtWarpShape { + // kP * kQ * WarpNumThreadsM = WarpShapeM + // If needed, enable more specializations. 
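  //
  // Worked examples of the invariant above, using the specializations that follow
  // (kP * kQ * WarpNumThreadsM == WarpShapeM):
  //
  //   SimtWarpShape< 8, 1> -> kP = 2, kQ = 4   (2 * 4 * 1 ==  8)
  //   SimtWarpShape<16, 2> -> kP = 2, kQ = 4   (2 * 4 * 2 == 16)
  //   SimtWarpShape<32, 4> -> kP = 2, kQ = 4   (2 * 4 * 4 == 32)
  //
  // Each thread therefore covers a kP x kQ patch of output pixels, chosen to be as close
  // to square as the factorization allows.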
+}; +template <> +struct SimtWarpShape<4, 4> { + static constexpr int kP = 1; + static constexpr int kQ = 1; +}; + +template <> +struct SimtWarpShape<4, 2> { + static constexpr int kP = 2; + static constexpr int kQ = 1; +}; + +template <> +struct SimtWarpShape<4, 1> { + static constexpr int kP = 2; + static constexpr int kQ = 2; +}; + +template <> +struct SimtWarpShape<8, 1> { + static constexpr int kP = 2; + static constexpr int kQ = 4; +}; +template <> +struct SimtWarpShape<8, 2> { + static constexpr int kP = 2; + static constexpr int kQ = 2; +}; +template <> +struct SimtWarpShape<8, 4> { + static constexpr int kP = 1; + static constexpr int kQ = 2; +}; + +template <> +struct SimtWarpShape<16, 1> { + static constexpr int kP = 4; + static constexpr int kQ = 4; +}; +template <> +struct SimtWarpShape<16, 2> { + static constexpr int kP = 2; + static constexpr int kQ = 4; +}; +template <> +struct SimtWarpShape<16, 4> { + static constexpr int kP = 2; + static constexpr int kQ = 2; +}; + +template +struct SimtWarpShape<25, WarpNumThreadsM> { + static_assert(WarpNumThreadsM == 1, "WarpShapeM could not be evenly splited by threads"); + static constexpr int kP = 5; + static constexpr int kQ = 5; +}; + +template <> +struct SimtWarpShape<32, 1> { + static constexpr int kP = 4; + static constexpr int kQ = 8; +}; + +template <> +struct SimtWarpShape<32, 2> { + static constexpr int kP = 4; + static constexpr int kQ = 4; +}; + +template <> +struct SimtWarpShape<32, 4> { + static constexpr int kP = 2; + static constexpr int kQ = 4; +}; + +} // namespace detail + +template < + /// Shape of threadblock-scoped matrix multiply operator + typename Shape, + /// Shape of warp-level matrix multiply operator + typename WarpShape, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape, + /// Element data type of A operand + typename ElementA, + /// Layout of operand A + typename LayoutA, + /// Element data type of B operand + typename ElementB, + /// Layout of operand B + typename LayoutB, + /// Data type of accumulator + typename ElementC, + /// Layout of accumulator + typename LayoutC, + /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp) + typename OperatorClass, + /// Size of a warp-scoped per thread access + int kLaneAccessSizeA_ = 0, + /// Size of a warp-scoped per thread access + int kLaneAccessSizeB_ = 0, + /// Number of stages + int Stages = 2, + /// Operation performed by MMA + typename Operator = typename platform::conditional< + (platform::is_same::value) && + (platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value), + cutlass::arch::OpMultiplyAddSaturate, + cutlass::arch::OpMultiplyAdd>::type, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA = + cutlass::arch::CacheOperation::Global, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB = + cutlass::arch::CacheOperation::Global, + /// per-element transformation for elements of A + ComplexTransform TransformA = ComplexTransform::kNone, + /// per-element transformation for elements of B + ComplexTransform TransformB = ComplexTransform::kNone, + bool IsComplex = false // (is_complex::value || is_complex::value) +> +struct DepthwiseMmaCoreWithLaneAccessSize; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Shape of threadblock-scoped matrix multiply operator + typename Shape, + /// Shape of threadblock-scoped output tile + typename ThreadBlockOutputShape, + /// Shape of filter shape per threadblock + typename FilterShape, + /// Shape of warp-level matrix multiply operator + typename WarpShape, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape, + /// Element data type of A operand + typename ElementA, + /// Layout of operand A + typename LayoutA, + /// Element data type of B operand + typename ElementB, + /// Layout of operand B + typename LayoutB, + /// Data type of accumulator + typename ElementC, + /// Layout of accumulator + typename LayoutC, + /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp) + typename OperatorClass, + /// Size of a warp-scoped per thread access + int kLaneAccessSizeA_ = 0, + /// Size of a warp-scoped per thread access + int kLaneAccessSizeB_ = 0, + /// Number of stages + int Stages = 2, + /// Operation performed by MMA + typename Operator = typename platform::conditional< + (platform::is_same::value) && + (platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value), + cutlass::arch::OpMultiplyAddSaturate, + cutlass::arch::OpMultiplyAdd>::type, + /// Iterator algo type + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + /// Stride ( MatrixShape ) + typename StrideShape = cutlass::MatrixShape<-1, -1>, + /// Dilation ( MatrixShape ) + typename DilationShape = cutlass::MatrixShape<-1, -1>, + /// Activation Shape loaded by threadblock + typename ActivationShape = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA = + cutlass::arch::CacheOperation::Global, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB = + cutlass::arch::CacheOperation::Global, + /// per-element transformation for elements of A + ComplexTransform TransformA = ComplexTransform::kNone, + /// per-element transformation for elements of B + ComplexTransform TransformB = ComplexTransform::kNone, + bool IsComplex = false // (is_complex::value || is_complex::value) +> +struct DepthwiseDirectConvMmaCoreWithLaneAccessSize; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Shape of threadblock-scoped matrix multiply operator + typename Shape, + /// Shape of warp-level matrix multiply operator + typename WarpShape, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape, + /// Element data type of A operand + typename ElementA, + /// Layout of operand A + typename LayoutA, + /// Element data type of B operand + typename ElementB, + /// Layout of operand B + typename LayoutB, + /// Data type of accumulator + typename ElementC, + /// Layout of accumulator + typename LayoutC, + /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp) + typename OperatorClass, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. + bool AccumulatorsInRowMajor, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// per-element transformation for elements of A + ComplexTransform TransformA, + /// per-element transformation for elements of B + ComplexTransform TransformB, + bool IsComplex +> +struct DepthwiseMmaCoreWithLaneAccessSize< + Shape, WarpShape, InstructionShape, + ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, + OperatorClass, -1, -1, Stages, Operator, AccumulatorsInRowMajor, + CacheOpA, CacheOpB, TransformA, TransformB, IsComplex +> : cutlass::gemm::threadblock::DefaultMmaCore< + Shape, WarpShape, InstructionShape, + ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, + OperatorClass, Stages, Operator, AccumulatorsInRowMajor, + CacheOpA, CacheOpB, TransformA, TransformB, IsComplex +> {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: column-major +/// Operator: simt class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Size of a warp-scoped per thread access (a value of -1 indicates the default) + int kLaneAccessSizeA_, + /// Size of a warp-scoped per thread access (a value of -1 indicates the default) + int kLaneAccessSizeB_, + /// Operation performed by GEMM + typename Operator_> +struct DepthwiseMmaCoreWithLaneAccessSize, + ElementA_, + 
layout::RowMajor, + ElementB_, + layout::ColumnMajor, + ElementC_, + LayoutC_, + arch::OpClassSimt, + kLaneAccessSizeA_, + kLaneAccessSizeB_, + 2, + Operator_> : public cutlass::gemm::threadblock::DefaultMmaCore, + ElementA_, + layout::RowMajor, + ElementB_, + layout::ColumnMajor, + ElementC_, + LayoutC_, + arch::OpClassSimt, + 2, + Operator_> { + using Base = cutlass::gemm::threadblock::DefaultMmaCore, + ElementA_, + layout::RowMajor, + ElementB_, + layout::ColumnMajor, + ElementC_, + LayoutC_, + arch::OpClassSimt, + 2, + Operator_>; + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassSimt; + + static int const kLaneAccessSizeA = kLaneAccessSizeA_; + static int const kLaneAccessSizeB = kLaneAccessSizeB_; + + // Divisility requirements + static_assert( kLaneAccessSizeA > 0 && kLaneAccessSizeB > 0, + "Size of a warp-scoped per thread access should be larger then ZERO" ); + + /// Default Operator + using Operator = Operator_; + + /// Number of warps present + using WarpCount = typename Base::WarpCount; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && + !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size." + ); + + /// Number of threads per warp + static int const kWarpSize = cutlass::gemm::warp::WarpSize::value; + + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory are same as base class + // + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = cutlass::gemm::threadblock::detail::simt_get_warp_threads_m(); + static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM; + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 
2 : 1; + static const int numElementsA = kLaneAccessSizeA / sizeof_bits::value; + static const int numElementsB = kLaneAccessSizeB / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + + static int const kPaddingM = cutlass::gemm::threadblock::detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits::value); + static int const kPaddingN = cutlass::gemm::threadblock::detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits::value); + + static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN), + "Padding must be divisible by Lane"); + + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) + >; + + /// Policy used to define MmaPipelined + using MmaPolicy = cutlass::gemm::threadblock::MmaPolicy< + MmaWarpSimt, + MatrixShape, // skew for A matrix to avoid SMEM bank conflicts + MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts + WarpCount::kK + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: row-major +/// Operator: simt class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of threadblock-scoped output tile (concept: TensorNHWCShape) + typename ThreadBlockOutputShape_, + /// Shape of filter shape per threadblock + typename FilterShape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Size of a warp-scoped per thread access + int kLaneAccessSizeA_, + /// Number of stages + int Stages_, + /// Operation performed by GEMM + typename Operator_> +struct DepthwiseDirectConvMmaCoreWithLaneAccessSize, + ElementA_, + layout::RowMajor, + ElementB_, + layout::ColumnMajor, + ElementC_, + LayoutC_, + arch::OpClassSimt, + kLaneAccessSizeA_, + 128, + Stages_, + Operator_> { + using Shape = Shape_; + using FilterShape = FilterShape_; + using WarpShape = WarpShape_; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassSimt; + + static int const kLaneAccessSizeB = 128; + + // Divisility requirements + static_assert( kLaneAccessSizeB > 0, + "Size of a warp-scoped per 
thread access should be larger then ZERO" ); + + /// Default Operator + using Operator = Operator_; + + /// Number of warps present + using WarpCount = cutlass::gemm::GemmShape< + Shape::kM / WarpShape::kM, + Shape::kN / WarpShape::kN, + 1 + >; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && + !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size." + ); + + /// Number of threads per warp + static int const kWarpSize = cutlass::gemm::warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + // For Gmem load + static int const kElementsPerAccessA = 128 / sizeof_bits::value; + static int const kElementsPerAccessB = 128 / sizeof_bits::value; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajor; + using SmemLayoutB = layout::RowMajor; + + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, // Set kStrided = 1 because activation shape is runtime value. + kThreads, + kElementsPerAccessA + >; + + /// ThreadMap of iterator A + using SmemThreadMapA = IteratorThreadMapA; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIteratorDirectConv< + MatrixShape<1, Shape::kN>, // set kRow is 1 because it is a runtime value + ElementA, + SmemLayoutA, + 0, + SmemThreadMapA, // was IteratorThreadMapA + true // Dynamic iterations. + >; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccessB + >; + + /// Transpose the ThreadMap of iterator B + using SmemThreadMapB = IteratorThreadMapB; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIteratorDirectConv< + MatrixShape, + ElementB, + SmemLayoutB, + 0, + SmemThreadMapB, // was IteratorThreadMapB + false // static iterations. + >; + + // + // Warp-level matrix multiply operator + // + // Groups per threads + // Fp32: 2 groups + // Fp16: 2 groups + static const int GroupsPerThread = sizeof(ElementB) > 1 ? 
2 : 4; + // Define the warp-level op + static const int WarpNumThreadsN = cutlass::const_min(WarpShape::kN / GroupsPerThread, kWarpSize); + static const int WarpNumThreadsM = kWarpSize / WarpNumThreadsN; + + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + + // Get output P, Q per thread + static const int TileP = cutlass::conv::threadblock::detail::SimtWarpShape::kP; + static const int TileQ = cutlass::conv::threadblock::detail::SimtWarpShape::kQ; + + static const int LaneLayout = 1; + static const int numElementsB = kLaneAccessSizeB / sizeof_bits::value; + static const int LaneN = cutlass::const_min(numElementsB, WarpShape::kN / WarpNumThreadsN); + + // Define the output tile computed by each thread + using ThreadOutputShape = cutlass::conv::TensorNHWCShape<1, TileP, TileQ, LaneN>; + + // Fetch the channel with same access size + static const int LaneM = LaneN; + + // No paddings + static int const kPaddingM = 0; + static int const kPaddingN = 0; + + static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN), + "Padding must be divisible by Lane"); + + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseDirectConvSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> + FilterShape, /// Shape of filter shape per threadblock - concept: gemm::GemmShape + ThreadOutputShape, /// Size of the output tile computed by thread - concept: conv::TensorNHWCShape<> + ThreadBlockOutputShape_, /// Size of the output tile computed by threadblock - concept: conv::TensorNHWCShape<> + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) + >; + + /// Policy used to define MmaPipelined + using MmaPolicy = cutlass::conv::threadblock::DepthwiseDirectConvMmaPolicy< + MmaWarpSimt, + MatrixShape, // skew for A matrix to avoid SMEM bank conflicts + MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts + IteratorThreadMapA, + IteratorThreadMapB, + WarpCount::kK + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: row-major +/// Operator: simt class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of threadblock-scoped output tile (concept: TensorNHWCShape) + typename ThreadBlockOutputShape_, + /// Shape of filter shape per threadblock + typename FilterShape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Size of a warp-scoped per thread access + int 
kLaneAccessSizeA_, + /// Number of stages + int Stages_, + /// Operation performed by GEMM + typename Operator_, + /// Stride ( MatrixShape ) + typename StrideShape_, + /// Dilation ( MatrixShape ) + typename DilationShape_, + /// Activation Shape loaded by threadblock + typename ActivationShape_> +struct DepthwiseDirectConvMmaCoreWithLaneAccessSize, + ElementA_, + layout::RowMajor, + ElementB_, + layout::ColumnMajor, + ElementC_, + LayoutC_, + arch::OpClassSimt, + kLaneAccessSizeA_, + 128, + Stages_, + Operator_, + IteratorAlgorithm::kFixedStrideDilation, + StrideShape_, + DilationShape_, + ActivationShape_> { + using Shape = Shape_; + using FilterShape = FilterShape_; + using WarpShape = WarpShape_; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassSimt; + using StrideShape = StrideShape_; + using DilationShape = DilationShape_; + using ThreadBlockOutputShape = ThreadBlockOutputShape_; + using ActivationShape = ActivationShape_; + + static int const kLaneAccessSizeB = 128; + + // Divisility requirements + static_assert( kLaneAccessSizeB > 0, + "Size of a warp-scoped per thread access should be larger then ZERO" ); + + /// Default Operator + using Operator = Operator_; + + /// Number of warps present + using WarpCount = cutlass::gemm::GemmShape< + Shape::kM / WarpShape::kM, + Shape::kN / WarpShape::kN, + 1 + >; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && + !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size." + ); + + /// Number of threads per warp + static int const kWarpSize = cutlass::gemm::warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + // For Gmem load + static int const kElementsPerAccessA = 128 / sizeof_bits::value; + static int const kElementsPerAccessB = 128 / sizeof_bits::value; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajor; + using SmemLayoutB = layout::RowMajor; + + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccessA + >; + + /// ThreadMap of iterator A + using SmemThreadMapA = IteratorThreadMapA; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIteratorDirectConv< + MatrixShape, + ElementA, + SmemLayoutA, + 0, + SmemThreadMapA, // was IteratorThreadMapA + false // static iterations. + >; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccessB + >; + + /// Transpose the ThreadMap of iterator B + using SmemThreadMapB = IteratorThreadMapB; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIteratorDirectConv< + MatrixShape, + ElementB, + SmemLayoutB, + 0, + SmemThreadMapB, // was IteratorThreadMapB + false // static iterations. + >; + + // + // Warp-level matrix multiply operator + // + // Groups per threads + // Fp32: 2 groups + // Fp16: 2 groups + static const int GroupsPerThread = sizeof(ElementB) > 1 ? 
2 : 4; + // Define the warp-level op + static const int WarpNumThreadsN = cutlass::const_min(WarpShape::kN / GroupsPerThread, kWarpSize); + static const int WarpNumThreadsM = kWarpSize / WarpNumThreadsN; + + static const int TileP = cutlass::conv::threadblock::detail::SimtWarpShape::kP; + static const int TileQ = cutlass::conv::threadblock::detail::SimtWarpShape::kQ; + + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + + static const int LaneLayout = 1; + static const int numElementsB = kLaneAccessSizeB / sizeof_bits::value; + static const int LaneN = cutlass::const_min(numElementsB, WarpShape::kN / WarpNumThreadsN); + + // Define the output tile computed by each thread + using ThreadOutputShape = cutlass::conv::TensorNHWCShape<1, TileP, TileQ, LaneN>; + + // Fetch the channel with same access size + static const int LaneM = LaneN; + + // No paddings + static int const kPaddingM = 0; + static int const kPaddingN = 0; + + static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN), + "Padding must be divisible by Lane"); + + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseDirectConvSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> + FilterShape, /// Shape of filter shape per threadblock - concept: gemm::GemmShape + ThreadOutputShape, /// Size of the output tile computed by thread - concept: conv::TensorNHWCShape<> + ThreadBlockOutputShape, /// Size of the output tile computed by threadblock - concept: conv::TensorNHWCShape<> + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) + IteratorAlgorithm::kFixedStrideDilation, /// Iterator algo type + StrideShape, /// Stride ( MatrixShape ) + DilationShape, /// Dilation ( MatrixShape ) + ActivationShape /// Activation Shape loaded by threadblock + >; + + /// Policy used to define MmaPipelined + using MmaPolicy = cutlass::conv::threadblock::DepthwiseDirectConvMmaPolicy< + MmaWarpSimt, + MatrixShape, // skew for A matrix to avoid SMEM bank conflicts + MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts + IteratorThreadMapA, + IteratorThreadMapB, + WarpCount::kK + >; +}; +} // namespace threadblock +} // namespace conv +} // namespace cutlass diff --git a/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h index 4f16b42d8c..3bee07d0ab 100644 --- a/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h +++ b/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -64,7 +64,7 @@ #include "cutlass/arch/cache_operation.h" #include "cutlass/gemm/gemm.h" -#include "cutlass/conv/warp/conv2d_fprop_scale_bias_iterator.h" +#include "cutlass/gemm/warp/scale_bias_tile_iterator.h" #include "cutlass/conv/warp/scale_bias_relu_transform.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -139,6 +139,13 @@ class MmaFpropFusionBase { /// Tensor reference to the B operand using TensorRefB = TensorRef; + static_assert(kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + static_assert((kWarpGemmIterations % 2) == 0, + "Inner loop iteration must be an even number."); + // // Nested structs // @@ -319,7 +326,7 @@ class ImplicitGemmFpropFusionMultistage using Policy = Policy_; ///< Base class using Base = MmaFpropFusionBase; using SmemIteratorA = SmemIteratorA_; @@ -518,6 +525,8 @@ class ImplicitGemmFpropFusionMultistage IteratorScaleBias iterator_A_scale_bias, ///< initial value of accumulator FragmentC const &src_accum, + ///< number of iterations per channel + int gemm_k_iterations_per_channel = 0, ///< Imaginary strides used for planar-complex only - ignored here int64_t imag_stride_A = 0, int64_t imag_stride_B = 0) { diff --git a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h index 36b41aacd4..eea7743a40 100644 --- a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h +++ b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -116,10 +116,6 @@ class ImplicitGemmMultistage : /// Internal structure exposed for introspection. struct Detail { - static_assert(Base::kWarpGemmIterations > 1, - "The pipelined structure requires at least two warp-level " - "GEMM operations."); - /// Number of cp.async instructions to load one stage of operand A static int const AsyncCopyIterationsPerStageA = IteratorA::ThreadMap::Iterations::kCount; @@ -138,6 +134,12 @@ class ImplicitGemmMultistage : /// Number of cp.async instructions to load on group of operand B static int const kAccessesPerGroupB = (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + // Optional staged-accumulation (e.g., tf32x3 kernels) for improved numerical + // accuracy, where each mainloop iteration first accumulates into a temporary + // set of freshly-cleared accumulators, which are subsequently added to the + // final accumulator set. 
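    // Condensed sketch of how this flag is consumed later in the mainloop (paraphrased from
    // the surrounding hunks, not a verbatim excerpt):
    //
    //   FragmentC tmp_accum;
    //   if (Detail::kStagedAccumulation) { tmp_accum.clear(); }        // fresh partial sums
    //   ... warp_mma(tmp_accum, frag_A, frag_B, tmp_accum); ...        // accumulate partials
    //   if (Detail::kStagedAccumulation) { accum = plus_accum(accum, tmp_accum); }  // fold in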
+ static bool const kStagedAccumulation = arch::detail::UseStagedAccumulation::value; }; private: @@ -272,6 +274,8 @@ class ImplicitGemmMultistage : IteratorB iterator_B, ///< initial value of accumulator FragmentC const &src_accum, + ///< number of iterations per channel + int gemm_k_iterations_per_channel = 0, ///< Imaginary strides used for planar-complex only - ignored here int64_t imag_stride_A = 0, int64_t imag_stride_B = 0) { @@ -297,7 +301,7 @@ class ImplicitGemmMultistage : CUTLASS_PRAGMA_UNROLL for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { - int const kSrcBytes = + int const kSrcBytes = sizeof_bits::value * IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; @@ -322,7 +326,7 @@ class ImplicitGemmMultistage : this->smem_iterator_B_.get()); CUTLASS_PRAGMA_UNROLL - for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { int const kSrcBytes = sizeof_bits::value * IteratorB::ThreadMap::kElementsPerAccess / @@ -389,10 +393,7 @@ class ImplicitGemmMultistage : FragmentC tmp_accum; - if (platform::is_same::value - || platform::is_same::value) { + if (Detail::kStagedAccumulation) { tmp_accum.clear(); } @@ -446,10 +447,7 @@ class ImplicitGemmMultistage : copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, group_start_iteration_B); - if (platform::is_same::value - || platform::is_same::value) { + if (Detail::kStagedAccumulation) { warp_mma( tmp_accum, warp_transformed_frag_A[warp_mma_k % 2], @@ -520,10 +518,7 @@ class ImplicitGemmMultistage : } - if (platform::is_same::value - || platform::is_same::value) { + if (Detail::kStagedAccumulation) { accum = plus_accum(accum, tmp_accum); } diff --git a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h index f77e2e3325..79bcb78aa5 100644 --- a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h +++ b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -188,6 +188,7 @@ class ImplicitGemmPipelined : public gemm::threadblock::MmaBase; + static_assert(kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + static_assert((kWarpGemmIterations % 2) == 0, + "Inner loop iteration must be an even number."); + // // Nested structs // @@ -306,10 +313,6 @@ class ImplicitGemmWgradFusionMultistage /// Internal structure exposed for introspection. 
struct Detail { - static_assert(Base::kWarpGemmIterations > 1, - "The pipelined structure requires at least two warp-level " - "GEMM operations."); - /// Number of cp.async instructions to load one stage of operand A static int const AsyncCopyIterationsPerStageA = IteratorA::ThreadMap::Iterations::kCount; @@ -470,6 +473,8 @@ class ImplicitGemmWgradFusionMultistage IteratorScaleBias iterator_B_scale_bias, ///< initial value of accumulator FragmentC const &src_accum, + ///< number of iterations per channel + int gemm_k_iterations_per_channel = 0, ///< Imaginary strides used for planar-complex only - ignored here int64_t imag_stride_A = 0, int64_t imag_stride_B = 0) { diff --git a/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h b/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h index 7d60e4b0b2..bfe9a39816 100644 --- a/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h +++ b/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -113,12 +113,9 @@ class PredicatedScaleBiasVectorAccessIterator( + const_cast(scale_pointer)) + : reinterpret_cast( + const_cast(bias_pointer)); + + // Per-thread offset in logical coordinates of tensor + int thread_base = (thread_id < kThreads) ? 0 : kThreads; + + thread_offset_ = + threadblock_offset + + TensorCoord((thread_id - thread_base) * kElementsPerAccess, 0); + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator( + /// Precomputed parameters object + Params const ¶ms, + /// Extent of tensor + Conv3dProblemSize const &problem_size, + /// Pointer to the start of the scale vector + ConstPointer scale_pointer, + /// Pointer to the start of the bias vector + ConstPointer bias_pointer, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const &threadblock_offset) + : params_(params), + problem_size_trs(problem_size.T * problem_size.R * problem_size.S), + problem_size_c(problem_size.C), + filter_trs_(0) { pointer_ = (thread_id < kThreads) ? 
reinterpret_cast( const_cast(scale_pointer)) @@ -177,6 +207,22 @@ class PredicatedScaleBiasVectorAccessIterator filter, i.e., stride={2x2} and filter={1x1}) // // * Optimization * - // Only launch CTAs in M dimenstion which contribute to a row in Dx output + // Only launch CTAs in M dimension which contribute to a row in Dx output // // // * Constraints * @@ -95,11 +95,11 @@ struct StridedDgradHorizontalThreadblockSwizzle : /// Returns the shape of the problem in units of logical tiles /// For ImplicitGemmConvolution Conv2d problem size: conv_operator(NPQK, NHWC, KRSC) CUTLASS_HOST_DEVICE - gemm::GemmCoord get_tiled_shape( + static gemm::GemmCoord get_tiled_shape( cutlass::conv::Operator conv_operator, cutlass::conv::Conv2dProblemSize const &problem_size, gemm::GemmCoord tile_size, - int split_k_slices) const { + int split_k_slices) { gemm::GemmCoord implicit_gemm_problem_size = cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size); @@ -107,7 +107,7 @@ struct StridedDgradHorizontalThreadblockSwizzle : // compute number of tiles in m dimension int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m()); - // compute number of tiles in n dimenstion + // compute number of tiles in n dimension int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(); return gemm::GemmCoord( @@ -136,11 +136,11 @@ struct StridedDgradIdentityThreadblockSwizzle : /// Returns the shape of the problem in units of logical tiles /// For ImplicitGemmConvolution Conv2d problem size: conv_operator(NPQK, NHWC, KRSC) CUTLASS_HOST_DEVICE - gemm::GemmCoord get_tiled_shape( + static gemm::GemmCoord get_tiled_shape( cutlass::conv::Operator conv_operator, cutlass::conv::Conv2dProblemSize const &problem_size, gemm::GemmCoord tile_size, - int split_k_slices) const { + int split_k_slices) { gemm::GemmCoord implicit_gemm_problem_size = cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size); @@ -148,7 +148,7 @@ struct StridedDgradIdentityThreadblockSwizzle : // compute number of tiles in m dimension int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m()); - // compute number of tiles in n dimenstion + // compute number of tiles in n dimension int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(); return gemm::GemmCoord( @@ -157,7 +157,6 @@ struct StridedDgradIdentityThreadblockSwizzle : split_k_slices); } - /// Returns the shape of the problem in units of logical tiles /// For GEMM problem size (MxNxK) (Do not use base class get_tiled_shape()) private: @@ -166,7 +165,29 @@ struct StridedDgradIdentityThreadblockSwizzle : ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Threadblock swizzling function for GEMMs +template +struct DepthwiseDirect2dConvIdentityThreadblockSwizzle + : public gemm::threadblock::GemmIdentityThreadblockSwizzle { + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvIdentityThreadblockSwizzle() {} + + /// Returns the shape of the problem in units of logical tiles + CUTLASS_HOST_DEVICE + static gemm::GemmCoord get_tiled_shape(cutlass::conv::Operator conv_operator, + cutlass::conv::Conv2dProblemSize const &problem_size, + gemm::GemmCoord tile_size, + int split_k_slices) { + + gemm::GemmCoord implicit_gemm_problem_size = + cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size); + + return gemm::GemmCoord(1, + (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(), + split_k_slices); + } +}; } // namespace threadblock -} // namespace gemm 
+} // namespace conv } // namespace cutlass diff --git a/include/cutlass/conv/warp/mma_depthwise_simt.h b/include/cutlass/conv/warp/mma_depthwise_simt.h new file mode 100644 index 0000000000..ed385df039 --- /dev/null +++ b/include/cutlass/conv/warp/mma_depthwise_simt.h @@ -0,0 +1,380 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing warp-level matrix multiply-accumulate operations. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma.h" + +#include "cutlass/gemm/thread/mma.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/thread/depthwise_mma.h" + + +#include "cutlass/gemm/warp/mma_simt_tile_iterator.h" +#include "cutlass/gemm/warp/mma_simt_policy.h" + +#include "cutlass/gemm/warp/mma_simt.h" +#include "cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. 
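// Usage sketch: the threadblock-level DepthwiseMmaCoreWithLaneAccessSize defined earlier in
// this patch instantiates this class roughly as follows. The concrete shapes and element
// types below are illustrative assumptions, not values taken from the patch:
//
//   using WarpShape    = cutlass::gemm::GemmShape<16, 64, 8>;
//   using LaneMmaShape = cutlass::gemm::GemmShape<4, 4, 1>;
//   using Policy       = cutlass::gemm::warp::MmaSimtPolicy<
//       cutlass::MatrixShape<4, 8>,               // WarpNumThreadsM x WarpNumThreadsN
//       cutlass::layout::RowMajorInterleaved<1>,  // lane layout
//       LaneMmaShape>;
//   using WarpMma = cutlass::conv::warp::MmaDepthwiseSimt<
//       WarpShape,
//       float, cutlass::layout::ColumnMajor,      // A operand held in shared memory
//       float, cutlass::layout::RowMajor,         // B operand held in shared memory
//       float, cutlass::layout::RowMajor,         // accumulator
//       Policy>;
//
// Relative to gemm::warp::MmaSimt, the principal difference is that IteratorB (and its
// fragment types) is replaced so that operand B is traversed with the depthwise tile
// iterator declared in mma_depthwise_simt_tile_iterator.h.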
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename ElementB_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename ElementC_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Number of partitions along K dimension + int PartitionsK = 1, + /// Complex transformation on operand A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transformation on operand B + ComplexTransform TransformB = ComplexTransform::kNone, + /// Used for partial specialization + typename Enable = bool> +class MmaDepthwiseSimt + : public cutlass::gemm::warp:: + MmaSimt { + using Base = cutlass::gemm::warp:: + MmaSimt; + +public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Data type of multiplicand A + using ElementA = ElementA_; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = ElementB_; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = ElementC_; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassSimt; + + /// Hard-coded for now + using ArchTag = arch::Sm50; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = TransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = TransformB; + +public: + + /// Iterates over the B operand in memory + using IteratorB = cutlass::conv::warp::DepthwiseMmaSimtTileIterator< + MatrixShape, + cutlass::gemm::Operand::kB, + ElementB, + LayoutB, + Policy, + PartitionsK, + Shape::kK + >; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentB = FragmentB; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaDepthwiseSimt():Base() {} +}; + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. 
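// Unlike MmaDepthwiseSimt above, this direct-convolution variant does not derive from
// gemm::warp::MmaSimt. It composes a thread-level
// cutlass::conv::thread::DepthwiseDirectConvElementwiseInnerProduct operator and is
// additionally parameterized on the per-threadblock filter shape, the per-thread and
// per-threadblock output tile shapes, and, when IteratorAlgorithm::kFixedStrideDilation is
// selected, compile-time stride, dilation, and activation shapes.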
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Shape of filter shape per threadblock - concept: gemm::GemmShape + typename FilterShape_, + /// Shape of the output tile computed by thread- concept: conv::TensorNHWCShape<> + typename ThreadOutputShape_, + /// Shape of the output tile computed by threadblock - concept: conv::TensorNHWCShape<> + typename ThreadBlockOutputShape_, + /// Data type of A elements + typename ElementA_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename ElementB_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename ElementC_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Iterator algo type + conv::IteratorAlgorithm IteratorAlgorithm_ = IteratorAlgorithm::kAnalytic, + /// Stride ( MatrixShape ) + typename StrideShape_ = cutlass::MatrixShape<-1, -1>, + /// Dilation ( MatrixShape ) + typename DilationShape_ = cutlass::MatrixShape<-1, -1>, + /// Activation Shape loaded by threadblock + typename ActivationShape_ = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>, + /// Number of partitions along K dimension + int PartitionsK = 1, + /// Complex transformation on operand A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transformation on operand B + ComplexTransform TransformB = ComplexTransform::kNone, + /// Used for partial specialization + typename Enable = bool> +class MmaDepthwiseDirectConvSimt { + public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Shape of filter shape per threadblock - concept: gemm::GemmShape + using FilterShape = FilterShape_; + + /// Shape of the output tile computed by thread- concept: conv::TensorNHWCShape<> + using ThreadOutputShape = ThreadOutputShape_; + + /// Shape of the output tile computed by threadblock - concept: conv::TensorNHWCShape<> + using ThreadBlockOutputShape = ThreadBlockOutputShape_; + + /// Data type of multiplicand A + using ElementA = ElementA_; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = ElementB_; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = ElementC_; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Iterator algo type + static conv::IteratorAlgorithm const IteratorAlgorithm = IteratorAlgorithm_; + + /// Stride ( MatrixShape ) + using StrideShape = StrideShape_; + + /// Dilation ( MatrixShape ) + using DilationShape = DilationShape_; + + /// Activation Shape loaded by threadblock + using ActivationShape = ActivationShape_; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassSimt; + + /// Hard-coded for now + using ArchTag = arch::Sm50; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = TransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = TransformB; + + static constexpr bool use_dp4a = (platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutA>::value || + platform::is_same< layout::RowMajorInterleaved<4>, LayoutA >::value) && + platform::is_same< ElementA, int8_t >::value && + 
platform::is_same< ElementB, int8_t >::value; + + using dp4a_type = typename platform::conditional< use_dp4a , int8_t, bool >::type; + + /// Thread-level matrix multiply accumulate operator + using ThreadMma = cutlass::conv::thread::DepthwiseDirectConvElementwiseInnerProduct< + cutlass::gemm::GemmShape< + Shape::kM / Policy::WarpShape::kRow, // number of output pixels proccessed per thread + Shape::kN / Policy::WarpShape::kColumn, // number of channels proccessed per thread + 1>, + ElementA, + ElementB, + ElementC, + arch::OpMultiplyAdd, + dp4a_type + >; + + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename ThreadMma::ArchMmaOperator; + + /// Indicates math operator + using MathOperator = typename ArchMmaOperator::Operator; + + /// Shape of the underlying instruction + using InstructionShape = cutlass::gemm::GemmShape<1,1,use_dp4a ? 4 : 1>; + +public: + + /// Iterates over the A operand in memory + using IteratorA = cutlass::conv::warp::DepthwiseDirect2dConvSimtTileIterator< + MatrixShape, // per warp + FilterShape, + ThreadOutputShape, + ThreadBlockOutputShape, + cutlass::gemm::Operand::kA, + ElementA, + Policy, + IteratorAlgorithm, + StrideShape, + DilationShape, + ActivationShape, + PartitionsK, + Shape::kK + >; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentA = FragmentA; + + /// Iterates over the B operand in memory + using IteratorB = cutlass::gemm::warp::MmaSimtTileIterator< + MatrixShape<1, Shape::kN>, + cutlass::gemm::Operand::kB, + ElementB, + LayoutB, + Policy, + PartitionsK, + Shape::kK + >; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentB = FragmentB; + + /// Iterates over the C operand in memory + using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator< + MatrixShape, + cutlass::gemm::Operand::kC, + ElementC, + LayoutC, + Policy + >; + + /// Storage for C tile + using FragmentC = typename ThreadMma::FragmentC; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaDepthwiseDirectConvSimt() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &d, + FragmentA a, + FragmentB b, + FragmentC const &c, int group_idx = 0) const { + + ThreadMma mma; + + mma(d, a, b, c); + } + + /// Transform the mma operands to the required types + CUTLASS_DEVICE + void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, + FragmentA const &A, FragmentB const &B) const { + dst_A = A; + dst_B = B; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace conv +} // namespace cutlass diff --git a/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h b/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h new file mode 100644 index 0000000000..26d9638bab --- /dev/null +++ b/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h @@ -0,0 +1,862 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Describes the lane policy used by warp-level matrix multiply operators targeting SIMT + instructions +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/conv/convolution.h" + +#include "cutlass/arch/memory_sm75.h" + +#include "cutlass/layout/matrix.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma_simt_policy.h" +#include "cutlass/gemm/warp/mma_simt_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Iterates over operands to warp-level matrix multiply operations targeting SIMT instructions +/// +/// concept: MutableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Operand identity + cutlass::gemm::Operand Operand, + /// Data type of A elements + typename Element_, + /// Layout of operand + typename Layout_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Number of partitions along K dimension - used in sliced-K + int PartitionsK = 1, + /// Group Size along kPartition - used in sliced-K + int PartitionGroupSize = 1 +> +class DepthwiseMmaSimtTileIterator; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Specialization for B operands of row-major layouts +/// +/// Concept: MutableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of A elements + typename Element_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Number of partitions along K dimension + int PartitionsK, + /// Group Size along kPartition - used in sliced-K + int 
PartitionGroupSize> +class DepthwiseMmaSimtTileIterator + : public cutlass::gemm::warp::MmaSimtTileIterator { + + using Base = cutlass::gemm::warp::MmaSimtTileIterator; + public: + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kB; + + /// Element type + using Element = Element_; + + /// Layout of policy + using Layout = layout::RowMajor; + + /// Decomposition of elements among threads + using Policy = Policy_; + + /// TensorRef type for loading element from a tensor + using TensorRef = typename Base::TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Thread-level shape of a fragment + using ThreadShape = typename Base::ThreadShape; + + /// Number of individual loads + using Iterations = typename Base::Iterations; + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + + static_assert(Policy::LaneMmaShape::kN == 1, "Each thread should be 1 element per LDS along the k-dim"); + +private: + + MatrixCoord lane_offset_; + int channel_idx_; + int base_channel_idx_; + int warps_n_; + + public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + DepthwiseMmaSimtTileIterator():Base() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + DepthwiseMmaSimtTileIterator( + TensorRef ref, + int lane_id + ) : Base(ref, lane_id) { + + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + warps_n_ = -1; + channel_idx_ = 0; + base_channel_idx_ = 0; + lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + DepthwiseMmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) { + + if(warps_n_ == -1){ + warps_n_ = coord.column(); + } + + Base::add_tile_offset(coord); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads) + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + Array *dst_ptr = + reinterpret_cast *>(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < Iterations::kRow; ++k) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Iterations::kColumn; ++n) { + + void const *ptr = this->ref_.data() + + this->ref_.offset({-(channel_idx_ - base_channel_idx_), + n * Policy::WarpShape::kColumn}) + + pointer_offset / Policy::LaneMmaShape::kN; + + // Base_k of a warp + Base_k of current threads. + int thread_k_base_idx = + warps_n_ * Shape::kColumn / Policy::LaneMmaShape::kN + lane_offset_.column(); + + if (channel_idx_ + k == thread_k_base_idx + n * Policy::WarpShape::kColumn) { + // Depthwise kernel would only do computation when channel == k. + // Loads an element when the current computation channel == the k corresponding to this thread. + arch::shared_load(dst_ptr[n + k * Iterations::kColumn], ptr); + } else { + // Reduce SMEM load + dst_ptr[n + k * Iterations::kColumn].fill(Element(0)); + } + } + } + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
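// Note on load_with_pointer_offset() above: a fragment slot (k, n) is populated from
// shared memory only when channel_idx_ + k matches the k index owned by this thread,
// i.e. thread_k_base_idx + n * Policy::WarpShape::kColumn; every other slot is
// zero-filled. In a depthwise convolution each output channel depends on exactly one
// filter channel, so skipping the non-matching taps reduces shared-memory traffic at
// the cost of a few register fills.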
+ CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + if(k_group % PartitionGroupSize == 0 && k_group != 0){ + base_channel_idx_ = k_group; + } + channel_idx_ = k_group; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Size of filter (concept: gemm::GemmShape) + typename FilterShape_, + /// Size of the matrix to load (concept: MatrixShape) + typename ThreadOutputShape_, + /// Size of the matrix to load (concept: MatrixShape) + typename ThreadBlockOutputShape_, + /// Operand identity + cutlass::gemm::Operand Operand, + /// Data type of A elements + typename Element_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Iterator algo type + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + /// Stride ( MatrixShape ) + typename StrideShape = cutlass::MatrixShape<-1, -1>, + /// Dilation ( MatrixShape ) + typename DilationShape = cutlass::MatrixShape<-1, -1>, + /// Activation Shape loaded by threadblock + typename ActivationShape = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>, + /// Number of partitions along K dimension - used in sliced-K + int PartitionsK = 1, + /// Group Size along kPartition - used in sliced-K + int PartitionGroupSize = 1> +class DepthwiseDirect2dConvSimtTileIterator; + + +/// Specialization for A operands of row-major layouts +/// +/// Concept: MutableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Size of filter (concept: gemm::GemmShape) + typename FilterShape_, + /// Size of the matrix to load (concept: TensorNHWC) + typename ThreadOutputShape_, + /// Size of the matrix to load (concept: TensorNHWC) + typename ThreadBlockOutputShape_, + /// Data type of A elements + typename Element_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Iterator algo type + conv::IteratorAlgorithm IteratorAlgorithm, + /// Stride ( MatrixShape ) + typename StrideShape, + /// Dilation ( MatrixShape ) + typename DilationShape, + /// Activation Shape loaded by threadblock + typename ActivationShape, + /// Number of partitions along K dimension - used in sliced-K + int PartitionsK, + /// Group Size along kPartition - used in sliced-K + int PartitionGroupSize> +class DepthwiseDirect2dConvSimtTileIterator { + public: + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Shape of filter (concept: gemm::GemmShape) + using FilterShape = FilterShape_; + + /// Shape of tile to load (concept: TensorNHWC) + using ThreadOutputShape = ThreadOutputShape_; + + /// Shape of tile to load (concept: TensorNHWC) + using ThreadBlockOutputShape = ThreadBlockOutputShape_; + + /// Operand tag + static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kA; + + /// Element type + using Element = Element_; + + /// Layout of policy + using Layout = layout::RowMajor; + + /// Decomposition of elements among threads + using Policy = Policy_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + 
+ /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + // + // Derived quantities + // + + static_assert(!(Shape::kRow % Policy::WarpShape::kRow), + "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension."); + + static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero."); + static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero."); + static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero."); + static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero."); + +// Thread-level shape of a fragment + using ThreadShape = MatrixShape< + ThreadOutputShape::kNHW, // Output tile shape Computed by current threads + ThreadOutputShape::kC + >; + + static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), + "Thread-level GEMM must be divisible by Policy::LaneMmaShape."); + + /// Number of individual loads + using Iterations = MatrixShape< + ThreadShape::kRow, + ThreadShape::kColumn / Policy::LaneMmaShape::kN + >; + + using ThreadTileCount = MatrixShape< + ThreadBlockOutputShape::kH / ThreadOutputShape::kH, + ThreadBlockOutputShape::kW / ThreadOutputShape::kW + >; + + /// Fragment object holding a thread's part of a tile + using Fragment = Array; + +protected: + + /// Internal reference + cutlass::TensorRef, layout::RowMajor> ref_; + + int activation_offset[ThreadOutputShape::kH][ThreadOutputShape::kW][Iterations::kColumn]; + int iterator_r_; + int iterator_s_; + int iterator_offset_; + + int inc_next_s_ ; + int inc_next_r_ ; + + MatrixCoord lane_offset_; +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator( + TensorRef ref, + int lane_id + ) { + + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + // Set channel offset + lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN); + + ref.add_coord_offset(lane_offset_); + + ref_.reset(reinterpret_cast *>(ref.data()), + ref.stride(0) / Policy::LaneMmaShape::kN); + + iterator_r_ = 0; + iterator_s_ = 0; + iterator_offset_ = 0; + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
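// Worked example for the derived shapes above (hypothetical tile sizes, not taken from
// this header): with ThreadOutputShape = TensorNHWCShape<1, 2, 2, 8>,
// ThreadBlockOutputShape = TensorNHWCShape<1, 8, 8, 64>, and Policy::LaneMmaShape::kN == 4:
//   ThreadShape     = MatrixShape<1*2*2, 8> = 4 x 8 outputs per thread,
//   Iterations      = MatrixShape<4, 8/4>   = 4 x 2 vector loads per tile,
//   ThreadTileCount = MatrixShape<8/2, 8/2> = 4 x 4 thread groups tiling the
//                     threadblock output; setup_initial_status() below uses it to derive
//                     each thread group's base (p, q) output coordinate.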
+ template + CUTLASS_HOST_DEVICE + void setup_initial_status(Params const& params) { + + inc_next_s_ = params.inc_next[0]; + inc_next_r_ = params.inc_next[1]; + + // Get base HW offset of current threads + int threadgroup = threadIdx.x / (ThreadBlockOutputShape::kC / ThreadOutputShape::kC); + int base_p_ = + (threadgroup / (ThreadTileCount::kColumn)) * ThreadOutputShape::kH; + int base_q_ = + (threadgroup % (ThreadTileCount::kColumn)) * ThreadOutputShape::kW; + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < ThreadOutputShape::kH; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int q = 0; q < ThreadOutputShape::kW; ++q) { + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < Iterations::kColumn; ++col) { + int base_w = (base_q_ + q) * params.stride[0]; + int base_h = (base_p_ + p) * params.stride[1]; + + int offset = base_h * params.activation_tile_w + base_w; + activation_offset[p][q][col] = offset; + } + } + } + } + + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &add_tile_offset(TensorCoord const &coord) { + // Set warp row and col start + lane_offset_ = MatrixCoord({lane_offset_.row() + coord.row() * Shape::kRow, lane_offset_.column()}); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + void advance(int32_t pointer_offset) { + ref_.reset(ref_.data() + pointer_offset / sizeof(Element) / Policy::LaneMmaShape::kN); + iterator_s_ = 0; + iterator_r_ = 0; + iterator_offset_ = 0; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &operator++() { + ++iterator_s_; + if (iterator_s_ < FilterShape::kColumn) { + iterator_offset_ += inc_next_s_; + + return *this; + } + + iterator_s_ = 0; + + ++iterator_r_; + if (iterator_r_ < FilterShape::kRow) { + iterator_offset_ += inc_next_r_; + return *this; + } + + iterator_r_ = 0; + iterator_offset_ = 0; + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator & operator--() { + // Do nothing + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads) + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + + Array *dst_ptr = + reinterpret_cast *>(&frag); + + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < ThreadOutputShape::kH; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int q = 0; q < ThreadOutputShape::kW; ++q) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Iterations::kColumn; ++n) { + void const *ptr = ref_.data() + + ref_.offset({activation_offset[p][q][n] + (iterator_offset_), + n * Policy::WarpShape::kColumn}) + + pointer_offset / Policy::LaneMmaShape::kN; + arch::shared_load(dst_ptr[n + q + p * ThreadOutputShape::kW], ptr); + } + } + } + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const { + // Do nothing at present. 
+ } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store(Fragment const &frag, Index pointer_offset) const { + store_with_pointer_offset(frag, 0); + } + + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation here + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/// Specialization for A operands of row-major layouts +/// +/// Concept: MutableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Size of filter (concept: gemm::GemmShape) + typename FilterShape_, + /// Size of the matrix to load (concept: TensorNHWC) + typename ThreadOutputShape_, + /// Size of the matrix to load (concept: TensorNHWC) + typename ThreadBlockOutputShape_, + /// Data type of A elements + typename Element_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Stride ( MatrixShape ) + typename StrideShape_, + /// Dilation ( MatrixShape ) + typename DilationShape_, + /// Activation Shape loaded by threadblock + typename ActivationShape_, + /// Number of partitions along K dimension - used in sliced-K + int PartitionsK, + /// Group Size along kPartition - used in sliced-K + int PartitionGroupSize> +class DepthwiseDirect2dConvSimtTileIterator { + public: + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Shape of filter (concept: gemm::GemmShape) + using FilterShape = FilterShape_; + + /// Shape of tile to load (concept: TensorNHWC) + using ThreadOutputShape = ThreadOutputShape_; + + /// Shape of tile to load (concept: TensorNHWC) + using ThreadBlockOutputShape = ThreadBlockOutputShape_; + + /// Stride ( MatrixShape ) + using StrideShape = StrideShape_; + + /// Dilation ( MatrixShape ) + using DilationShape = DilationShape_; + + /// Activation Shape loaded by threadblock + using ActivationShape = ActivationShape_; + + /// Operand tag + static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kA; + + /// Element type + using Element = Element_; + + /// Layout of policy + using Layout = layout::RowMajor; + + /// Decomposition of elements among threads + using Policy = Policy_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + // + // Derived quantities + // + + static_assert(!(Shape::kRow % Policy::WarpShape::kRow), + "The warp-level GEMM M size must be divisible by the number of threads arranged " + "along the M dimension."); + + static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero."); + static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero."); + static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero."); + static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, + "Shape::kRow / Policy::WarpShape::kRow must be greater than zero."); + + // Activations loaded by threadblock + static int const ThreadActivationShapeH = (ThreadOutputShape::kH - 1) * StrideShape::kRow + + (FilterShape::kRow - 1) * DilationShape::kRow + 1; + + static int const ThreadActivationShapeW = (ThreadOutputShape::kW - 1) * StrideShape::kColumn + + (FilterShape::kColumn - 1) * 
DilationShape::kColumn + 1; + + using ThreadActivationShape = cutlass::conv:: + TensorNHWCShape<1, ThreadActivationShapeH, ThreadActivationShapeW, ThreadOutputShape::kC>; + + // Thread-level shape of a fragment + using ThreadShape = + MatrixShape; + + static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), + "Thread-level GEMM must be divisible by Policy::LaneMmaShape."); + + /// Number of individual loads + using Iterations = + MatrixShape; + + using ThreadTileCount = MatrixShape; + + /// Fragment object holding a thread's part of a tile + using Fragment = Array; + + protected: + /// Internal reference + cutlass::TensorRef, layout::RowMajor> ref_; + + Array + activation[ThreadActivationShape::kH][ThreadActivationShape::kW][Iterations::kColumn]; + int iterator_r_; + int iterator_s_; + + + MatrixCoord lane_offset_; + + public: + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator() {} + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator(TensorRef ref, int lane_id) { + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + // Set channel offset + lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN); + + ref.add_coord_offset(lane_offset_); + + ref_.reset(reinterpret_cast *>(ref.data()), + ref.stride(0) / Policy::LaneMmaShape::kN); + + iterator_r_ = 0; + iterator_s_ = 0; + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
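// Worked example for ThreadActivationShape above (hypothetical sizes, not taken from this
// header): with ThreadOutputShape::kH == ThreadOutputShape::kW == 2, FilterShape 3x3,
// StrideShape 1x1, and DilationShape 1x1,
//   ThreadActivationShapeH = (2 - 1) * 1 + (3 - 1) * 1 + 1 = 4,
//   ThreadActivationShapeW = (2 - 1) * 1 + (3 - 1) * 1 + 1 = 4,
// so setup_initial_status() below stages a 4 x 4 x kC activation patch per thread, and
// load_with_pointer_offset() indexes into it with
//   h = p * StrideShape::kRow    + iterator_r_ * DilationShape::kRow,
//   w = q * StrideShape::kColumn + iterator_s_ * DilationShape::kColumn.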
+ template + CUTLASS_HOST_DEVICE void setup_initial_status( + Params const ¶ms) { + + // Get base HW offset of current threads + int threadgroup = threadIdx.x / (ThreadBlockOutputShape::kC / ThreadOutputShape::kC); + int base_h = + (threadgroup / (ThreadTileCount::kColumn)) * ThreadOutputShape::kH * StrideShape::kRow; + int base_w = + (threadgroup % (ThreadTileCount::kColumn)) * ThreadOutputShape::kW * StrideShape::kColumn; + + CUTLASS_PRAGMA_UNROLL + for (int h = 0; h < ThreadActivationShape::kH; ++h) { + CUTLASS_PRAGMA_UNROLL + for (int w = 0; w < ThreadActivationShape::kW; ++w) { + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < Iterations::kColumn; ++col) { + int offset = (base_h + h) * ActivationShape::kW + (base_w + w); + + void const *ptr = ref_.data() + ref_.offset({offset, col * Policy::WarpShape::kColumn}); + arch::shared_load(activation[h][w][col], ptr); + } + } + } + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &add_tile_offset(TensorCoord const &coord) { + // Set warp row and col start + lane_offset_ = + MatrixCoord({lane_offset_.row() + coord.row() * Shape::kRow, lane_offset_.column()}); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + void advance(int32_t pointer_offset) { + ref_.reset(ref_.data() + pointer_offset / sizeof(Element) / Policy::LaneMmaShape::kN); + iterator_s_ = 0; + iterator_r_ = 0; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &operator++() { + ++iterator_s_; + if (iterator_s_ < FilterShape::kColumn) { + return *this; + } + + iterator_s_ = 0; + + ++iterator_r_; + if (iterator_r_ < FilterShape::kRow) { + return *this; + } + + iterator_r_ = 0; + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &operator--() { + // Do nothing + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads) + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + Array *dst_ptr = + reinterpret_cast *>(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < ThreadOutputShape::kH; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int q = 0; q < ThreadOutputShape::kW; ++q) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Iterations::kColumn; ++n) { + const int h = p * StrideShape::kRow + iterator_r_ * DilationShape::kRow; + const int w = q * StrideShape::kColumn + iterator_s_ * DilationShape::kColumn; + + dst_ptr[n + q + p * ThreadOutputShape::kW] = activation[h][w][n]; + } + } + } + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { load_with_pointer_offset(frag, 0); } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const { + // Do nothing at present. 
+ } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store(Fragment const &frag, Index pointer_offset) const { + store_with_pointer_offset(frag, 0); + } + + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation here + } +}; + +} // namespace warp +} // namespace conv +} // namespace cutlass diff --git a/include/cutlass/conv/warp/scale_bias_relu_transform.h b/include/cutlass/conv/warp/scale_bias_relu_transform.h index 5bcbfcd0bd..4da31ab818 100644 --- a/include/cutlass/conv/warp/scale_bias_relu_transform.h +++ b/include/cutlass/conv/warp/scale_bias_relu_transform.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -101,9 +101,8 @@ struct FpropScaleBiasReluTransform { "}\n" : "=r"(ptr_activations[0]) : "r"(ptr_scale_bias[0]), "r"(ptr_activations[0]), - "r"(ptr_scale_bias[1]), "n"(0x7eff7eff)); + "r"(ptr_scale_bias[1]), "n"(cutlass::arch::OOB_NAN_F16x2)); #else - // TODO: write emulation code assert(0); #endif } @@ -151,8 +150,8 @@ struct WgradScaleBiasReluTransform { #if 1 // CUDA + PTX version - bool h1_oob = (reinterpret_cast(ptr_activations[0].x) == 0x7eff); - bool h2_oob = (reinterpret_cast(ptr_activations[0].y) == 0x7eff); + bool h1_oob = (reinterpret_cast(ptr_activations[0].x) == cutlass::arch::OOB_NAN_F16); + bool h2_oob = (reinterpret_cast(ptr_activations[0].y) == cutlass::arch::OOB_NAN_F16); // Apply per channel scale+bias+relu if the data is not a special NaN // (0x7eff). If it is a special NaN (0x7eff), hard code the output to 0. @@ -161,7 +160,7 @@ struct WgradScaleBiasReluTransform { // out-of-bound because C x R x S can be an odd number. asm volatile( "{\n\t" - " fma.rn.f16x2.relu %0 , %1, %2, %3;\n" + " fma.rn.f16x2.relu %0, %1, %2, %3;\n" "}" : "=r"(reinterpret_cast(ptr_activations[0])) : "r"(ptr_scale_bias[0]), "r"(reinterpret_cast(ptr_activations[0])), @@ -195,10 +194,9 @@ struct WgradScaleBiasReluTransform { "}\n" : "=r"(reinterpret_cast(ptr_activations[0])) : "r"(ptr_scale_bias[0]), "r"(reinterpret_cast(ptr_activations[0])), - "r"(ptr_scale_bias[1]), "n"(0x7eff), "n"(0xffff0000), "n"(0x0000ffff)); + "r"(ptr_scale_bias[1]), "n"(cutlass::arch::OOB_NAN_F16), "n"(0xffff0000), "n"(0x0000ffff)); #endif #else - // TODO: write emulation code assert(0); #endif } diff --git a/include/cutlass/coord.h b/include/cutlass/coord.h index 1fe8ec0feb..fe884d7037 100644 --- a/include/cutlass/coord.h +++ b/include/cutlass/coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -37,7 +37,7 @@ #if defined(__CUDACC_RTC__) #include #else -#include +#include #endif #include "cutlass/cutlass.h" @@ -100,12 +100,21 @@ struct Coord { } } + /// Constructs from some other Coord + template + CUTLASS_HOST_DEVICE + Coord(Coord other) { + for (int i = 0; i < kRank; ++i) { + idx[i] = other[i]; + } + } + /// Returns a slice of the Coord which may be larger or smaller in rank /// than this. template CUTLASS_HOST_DEVICE - Coord slice(int start = 0, Index identity = 0) const { - Coord result; + Coord slice(int start = 0, Index identity = 0) const { + Coord result; for (int i = 0; i < Slice; ++i) { if (i + start < kRank) { result[i] = idx[i + start]; diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h index eef4360267..40ae22246a 100644 --- a/include/cutlass/core_io.h +++ b/include/cutlass/core_io.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -31,7 +31,6 @@ /*! \file \brief Helpers for printing cutlass/core objects */ - #pragma once #include @@ -45,7 +44,7 @@ #include "cutlass/matrix_shape.h" #include "cutlass/layout/pitch_linear.h" #include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_enumerated_types.h" #include "cutlass/conv/convolution.h" #include "cutlass/conv/conv2d_problem_size.h" #include "cutlass/conv/conv3d_problem_size.h" @@ -252,8 +251,9 @@ namespace conv { inline std::ostream& operator<<(std::ostream& out, Conv2dProblemSize const& problem) { out << "NHWC: (" << problem.N << ", " << problem.H << ", " << problem.W << ", " << problem.C << ")" << std::endl - << "KRSC: (" << problem.K << ", " << problem.R << ", " << problem.S << ", " << problem.C << ")" << std::endl + << "KRSC: (" << problem.K << ", " << problem.R << ", " << problem.S << ", " << problem.C / problem.groups << ")" << std::endl << "NPQK: (" << problem.N << ", " << problem.P << ", " << problem.Q << ", " << problem.K << ")" << std::endl + << "groups: (" << problem.groups << ")" << std::endl << "Pad_h, Pad_w: (" << problem.pad_h << ", " << problem.pad_w << ")" << std::endl << "Stride_h, Stride_w: (" << problem.stride_h << ", " << problem.stride_w << ")" << std::endl << "Dilation_h, Dilation_w: (" << problem.dilation_h << ", " << problem.dilation_w << ")" << std::endl diff --git a/include/cutlass/cuda_host_adapter.hpp b/include/cutlass/cuda_host_adapter.hpp new file mode 100644 index 0000000000..2adfd2665f --- /dev/null +++ b/include/cutlass/cuda_host_adapter.hpp @@ -0,0 +1,412 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Interface betweeen a CUTLASS device-wide operator and CUDA. +*/ + +#pragma once + +#include +#include "cutlass/cutlass.h" +#include "cutlass/trace.h" + +#include "cutlass/platform/platform.h" +#if ! defined(__CUDACC_RTC__) +#include +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// NVRTC doesn't need definitions for these host classes + +#if ((__CUDACC_VER_MAJOR__ >= 12) || \ + ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) \ + && !defined(__CUDACC_RTC__) +#define CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED +#endif + +#if ((__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__)) +#define CUDA_HOST_ADAPTER_TENSORMAP_ENABLED +#endif + +// Include for CUDA Driver API calls if any of these capabilities are enabled. +#if defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) || \ + defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED) + +#include + +#endif // defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) || + // defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Macro-level guard for CUDA Host Adapter +// +#if !defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) +#define CUTLASS_ENABLE_CUDA_HOST_ADAPTER false +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +#if !defined(__CUDACC_RTC__) + +#if ((__CUDACC_VER_MAJOR__ >= 12) || \ + ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) +#include +#endif // (__CUDACC_VERSION__ >= 11.8) + +#include + +#define CUTLASS_CUDA_DRIVER_STRINGIFY(tok) #tok + +#if defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL) + +#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver) \ + template \ + CUresult call_##func(Args... 
args) { \ + return func(args...); \ + } + +#else // defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL) + +#if ((__CUDACC_VER_MAJOR__ >= 13) || \ + ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 5))) \ + +#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver) \ + template \ + CUresult call_##func(Args... args) { \ + cudaDriverEntryPointQueryResult cuda_status; \ + void* pfn = nullptr; \ + cudaError_t cuda_err = cudaGetDriverEntryPointByVersion( \ + CUTLASS_CUDA_DRIVER_STRINGIFY(func), \ + &pfn, ver, \ + cudaEnableDefault, \ + &cuda_status); \ + if (cuda_status != cudaDriverEntryPointSuccess || \ + cuda_err != cudaSuccess) { \ + return CUDA_ERROR_UNKNOWN; \ + } \ + return reinterpret_cast(pfn)(args...); \ + } + +#else + +#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver) \ + template \ + CUresult call_##func(Args... args) { \ + cudaDriverEntryPointQueryResult cuda_status; \ + void* pfn = nullptr; \ + cudaError_t cuda_err = cudaGetDriverEntryPoint( \ + CUTLASS_CUDA_DRIVER_STRINGIFY(func), \ + &pfn, \ + cudaEnableDefault, \ + &cuda_status); \ + if (cuda_status != cudaDriverEntryPointSuccess || \ + cuda_err != cudaSuccess) { \ + return CUDA_ERROR_UNKNOWN; \ + } \ + return reinterpret_cast(pfn)(args...); \ + } + +#endif // (__CUDACC_VERSION__ >= 12.5) + +#endif // defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL) + +#if (__CUDACC_VER_MAJOR__ >= 12) +CUTLASS_CUDA_DRIVER_WRAPPER_DECL(cuTensorMapEncodeTiled, 12000); +CUTLASS_CUDA_DRIVER_WRAPPER_DECL(cuTensorMapEncodeIm2col, 12000); +#endif + +#undef CUTLASS_CUDA_DRIVER_STRINGIFY + +#define CUTLASS_CUDA_DRIVER_WRAPPER_CALL(func) cutlass::call_##func + +#endif // !defined(__CUDACC_RTC__) + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// This class manages runtime CUlaunchAttribute that can be supplied to CudaHostAdapter +/// CudaHostLaunchAttributes will be an empty struct in earlier CTK where CUlaunchAttribute +/// is not introduced. +struct CudaHostLaunchAttributes { + +#if defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) + + /// Reasonable maximum launch attributes that are commonly applied + static constexpr int32_t kMaximumAttributeCount = 5; + + /// Launch attributes + CUlaunchAttribute launch_attributes[kMaximumAttributeCount]; + int32_t attribute_count = 0; + + CUTLASS_HOST_DEVICE + CudaHostLaunchAttributes(CUlaunchAttribute *launch_attributes_ = nullptr, + int32_t attribute_count_ = 0) { + CUTLASS_ASSERT(attribute_count_ >= 0 && attribute_count_ < kMaximumAttributeCount); + for (int32_t i = 0; i < attribute_count_ && i < kMaximumAttributeCount; ++i) { + launch_attributes[i] = launch_attributes_[i]; + } + attribute_count = attribute_count_; + } + + CUTLASS_HOST_DEVICE + CUlaunchAttribute const* data() const { + return launch_attributes; + } + + CUTLASS_HOST_DEVICE + size_t size() const { + return attribute_count; + } + +#endif // (CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) + +}; + + +/// This class defines an object which abstracts interactions between the CUTLASS device-wide GEMM and +/// CUDA. The intention is to enable CUTLASS to be used with both the CUDA Runtime API and CUDA Driver API. 
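// Usage sketch for CudaHostLaunchAttributes above (hypothetical attribute choice; assumes
// CUDA 11.8+ so that CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED is defined):
//
//   CUlaunchAttribute pdl_attr{};
//   pdl_attr.id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
//   pdl_attr.value.programmaticStreamSerializationAllowed = 1;
//   cutlass::CudaHostLaunchAttributes launch_attrs(&pdl_attr, 1);
//
// A concrete CudaHostAdapter subclass (declared below) can then forward launch_attrs to
// the extensible launch API (cuLaunchKernelEx / cudaLaunchKernelExC) when it implements
// the pure-virtual launch() methods.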
+struct CudaHostAdapter { + + /// Limit the number of kernels + static constexpr int32_t kMaximumKernelCount = 4; + + /// Maximum cluster size + static constexpr int MaxClusterSize = 32; + + // + // Data members + // + + /// Handles + void *kernel_handles[kMaximumKernelCount]; + int32_t kernel_count = 0; + + CudaHostLaunchAttributes launch_attributes; + + // + // Methods + // + + /// Ctor + CudaHostAdapter() = default; + + /// Dtor + virtual ~CudaHostAdapter() = default; + + /// Copy Ctor + CUTLASS_HOST_DEVICE + CudaHostAdapter(const CudaHostAdapter & rhs) + : kernel_count(rhs.kernel_count), + launch_attributes(rhs.launch_attributes) { + CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount); + + for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) { + kernel_handles[i] = rhs.kernel_handles[i]; + } + } + + /// Copy Assignment + CUTLASS_HOST_DEVICE + CudaHostAdapter& operator=(const CudaHostAdapter & rhs) { + CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount); + for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) { + kernel_handles[i] = rhs.kernel_handles[i]; + } + kernel_count = rhs.kernel_count; + + launch_attributes = rhs.launch_attributes; + + return *this; + } + + + /// Move ctor + CUTLASS_HOST_DEVICE + CudaHostAdapter(CudaHostAdapter && rhs) + : kernel_count(rhs.kernel_count), + launch_attributes(std::move(rhs.launch_attributes)) { + CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount); + + for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) { + kernel_handles[i] = rhs.kernel_handles[i]; + } + } + + // / Move assignment + CUTLASS_HOST_DEVICE + CudaHostAdapter& operator=(CudaHostAdapter && rhs) { + CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount); + for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) { + kernel_handles[i] = rhs.kernel_handles[i]; + } + kernel_count = rhs.kernel_count; + launch_attributes = std::move(rhs.launch_attributes); + return *this; + } + + /// Ctor + CUTLASS_HOST_DEVICE + CudaHostAdapter(void **kernel_handles_, + int32_t kernel_count_, + CudaHostLaunchAttributes const &launch_attributes_ = { }) + : kernel_count(kernel_count_), + launch_attributes(launch_attributes_) { + CUTLASS_ASSERT(kernel_count >= 0 && kernel_count < kMaximumKernelCount); + + for (int32_t i = 0; i < kernel_count && i < kMaximumKernelCount; ++i) { + kernel_handles[i] = kernel_handles_[i]; + } + } + + /// Returns true if the CudaHostAdapter is empty (kernel_count == 0) + CUTLASS_HOST_DEVICE + bool empty() const { return !kernel_count; } + + /// Returns kernel_count + CUTLASS_HOST_DEVICE + size_t size() const { return static_cast(kernel_count); } + + /// Queries the occupancy of a kernel + virtual Status query_occupancy( + int32_t *device_sms, + int32_t *sm_occupancy, + int32_t kernel_index, + int32_t thread_count, + int32_t smem_size) const = 0; + + /// Launches a kernel without using Threadblock Clusters. + virtual Status launch( + dim3 const grid_dims, + dim3 const block_dims, + size_t const smem_size, + cudaStream_t cuda_stream, + void** kernel_params, + int32_t kernel_index) const = 0; + + /// Launches a kernel using the CUDA Extensible Launch API and Threadblock Clusters. 
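// Call-site sketch for the two launch() overloads (hypothetical dimensions; `adapter`
// points to a caller-supplied concrete CudaHostAdapter subclass):
//
//   // Plain launch of kernel 0:
//   adapter->launch(dim3(grid_m, grid_n, 1), dim3(128, 1, 1), smem_bytes, stream,
//                   kernel_params, /*kernel_index=*/0);
//
//   // The cluster-enabled overload declared below additionally takes the cluster shape,
//   // e.g. dim3(2, 1, 1), between the grid and block dimensions.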
+ virtual Status launch( + dim3 const grid_dims, + dim3 const cluster_dims, + dim3 const block_dims, + size_t const smem_size, + cudaStream_t cuda_stream, + void** kernel_params, + int32_t kernel_index) const = 0; + +#if defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED) + + /// Create a tensor map descriptor object representing im2col memory region. + virtual CUresult tensorMapEncodeIm2col ( + CUtensorMap* tensorMap, + CUtensorMapDataType tensorDataType, + cuuint32_t tensorRank, + void* globalAddress, + const cuuint64_t* globalDim, + const cuuint64_t* globalStrides, + const int* pixelBoxLowerCorner, + const int* pixelBoxUpperCorner, + cuuint32_t channelsPerPixel, + cuuint32_t pixelsPerColumn, + const cuuint32_t* elementStrides, + CUtensorMapInterleave interleave, + CUtensorMapSwizzle swizzle, + CUtensorMapL2promotion l2Promotion, + CUtensorMapFloatOOBfill oobFill) const = 0; + + /// Create a tensor map descriptor object representing tiled memory region. + virtual CUresult tensorMapEncodeTiled ( + CUtensorMap* tensorMap, + CUtensorMapDataType tensorDataType, + cuuint32_t tensorRank, + void* globalAddress, + const cuuint64_t* globalDim, + const cuuint64_t* globalStrides, + const cuuint32_t* boxDim, + const cuuint32_t* elementStrides, + CUtensorMapInterleave interleave, + CUtensorMapSwizzle swizzle, + CUtensorMapL2promotion l2Promotion, + CUtensorMapFloatOOBfill oobFill) const = 0; + + /// Modify an existing tensor map descriptor with an updated global address. + virtual CUresult tensorMapReplaceAddress( + CUtensorMap* tensorMap, + void* globalAddress) const = 0; + +#endif // defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED) + +protected: + + /** + * Fills a buffer in Global Memory with a byte sequence copied from host memory. + * This function can be overriden to dispatch to the appropriate cuMemsetD*Async API + */ + virtual Status memsetDeviceImpl( + void* destination, ///< Device memory pointer to be filled + void const* fill_value, ///< Value to be filled in the buffer + size_t fill_size, ///< Size of the data type to be used for filling the buffer + size_t count, ///< Number of elements of size fill_size + cudaStream_t stream) const = 0; + +public: + + /// Fills a buffer in Global Memory with a byte sequence copied from host memory + template + CUTLASS_HOST_DEVICE + Status memsetDevice( + void* destination, + FillValueType fill_value, + size_t count, + cudaStream_t stream) const { + return this->memsetDeviceImpl( + destination, + &fill_value, + sizeof(FillValueType), + count, + stream); + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h index ebc4c1e905..e12616a201 100644 --- a/include/cutlass/cutlass.h +++ b/include/cutlass/cutlass.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -35,53 +35,13 @@ #pragma once -//////////////////////////////////////////////////////////////////////////////////////////////////// - -#ifdef CUTLASS_NAMESPACE -#define cutlass CUTLASS_NAMESPACE -#endif - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -#define CUTLASS_UNUSED(expr) do { (void)(expr); } while (0) - -#if !defined(__CUDACC_RTC__) - -#include - -#if defined(_MSC_VER) - #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__) -#else - #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__) -#endif - -#else - -#if defined(_MSC_VER) - #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__) -#else - #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__) -#endif - -#endif +#include "cutlass/arch/synclog.hpp" +#include "cutlass/detail/helper_macros.hpp" //////////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { -//////////////////////////////////////////////////////////////////////////////////////////////////// - -#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) -#define CUTLASS_HOST_DEVICE __forceinline__ __device__ __host__ -#define CUTLASS_DEVICE __forceinline__ __device__ -#elif defined(__CUDACC_RTC__) -#define CUTLASS_HOST_DEVICE __forceinline__ __device__ -#define CUTLASS_DEVICE __forceinline__ __device__ -#else -#define CUTLASS_HOST_DEVICE inline -#define CUTLASS_DEVICE inline -#endif - /// Status code returned by CUTLASS operations enum class Status { kSuccess, ///< Operation was successful. @@ -132,58 +92,64 @@ static char const* cutlassGetStatusString(cutlass::Status status) { //////////////////////////////////////////////////////////////////////////////////////////////////// - -#ifndef CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED -#define CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED 0 -#endif - - -// CUDA 10.1 introduces the mma instruction -#if !defined(CUTLASS_ENABLE_TENSOR_CORE_MMA) -#define CUTLASS_ENABLE_TENSOR_CORE_MMA 0 -#endif - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -#define CUTLASS_ASSERT(x) assert(x) +static const int NumThreadsPerWarp = 32; +static const int NumThreadsPerWarpGroup = 128; +static const int NumWarpsPerWarpGroup = NumThreadsPerWarpGroup / NumThreadsPerWarp; +static const int NumThreadsPerHalfWarp = NumThreadsPerWarp / 2; +static const int NumThreadsPerQuad = 4; +static const int NumThreadsPerQuadPair = NumThreadsPerQuad * 2; //////////////////////////////////////////////////////////////////////////////////////////////////// -// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler. -#if defined(__CUDA_ARCH__) - #if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) - #define CUTLASS_PRAGMA_UNROLL _Pragma("unroll") - #define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1") +/// Helper function to return true when called by thread 0 of threadblock 0. 
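// The constants added above supersede the former NUM_THREADS_PER_* globals removed later
// in this hunk: NumWarpsPerWarpGroup = 128 / 32 = 4, NumThreadsPerHalfWarp = 16, and
// NumThreadsPerQuadPair = 8. The canonical_lane_idx() / canonical_warp_idx*() /
// canonical_warp_group_idx() helpers that follow are expressed in terms of these constants.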
+CUTLASS_HOST_DEVICE bool thread0() { + #if defined(__CUDA_ARCH__) + return (!threadIdx.x && !threadIdx.y && !threadIdx.z) && (!blockIdx.x && !blockIdx.y && !blockIdx.z); #else - #define CUTLASS_PRAGMA_UNROLL #pragma unroll - #define CUTLASS_PRAGMA_NO_UNROLL #pragma unroll 1 + return false; #endif +} - #define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL - -#else - - #define CUTLASS_PRAGMA_UNROLL - #define CUTLASS_PRAGMA_NO_UNROLL - #define CUTLASS_GEMM_LOOP - -#endif - -//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Returns a lane index in the warp. The threads in warp may not be convergent +CUTLASS_DEVICE +int canonical_lane_idx() { + #if defined(__CUDA_ARCH__) + return threadIdx.x % NumThreadsPerWarp; + #else + return 0; + #endif +} -static const int NUM_THREADS_PER_WARP = 32; -static const int NUM_THREADS_PER_HALF_WARP = NUM_THREADS_PER_WARP / 2; -static const int NUM_THREADS_PER_QUAD = 4; -static const int NUM_THREADS_PER_QUAD_PAIR = NUM_THREADS_PER_QUAD * 2; +/// Returns a warp-uniform value indicating the canonical warp index of the calling threads. +/// Threads within the warp must be converged. +CUTLASS_DEVICE +int canonical_warp_idx_sync() { + #if defined(__CUDA_ARCH__) + return __shfl_sync(0xffffffff, threadIdx.x / NumThreadsPerWarp, 0); + #else + return 0; + #endif +} -//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Returns a warp index in the CTA. The threads in warp may not be convergent +/// As it doesn't sync the warp, it faster and allows forward progress +CUTLASS_DEVICE +int canonical_warp_idx() { + #if defined(__CUDA_ARCH__) + return threadIdx.x / NumThreadsPerWarp; + #else + return 0; + #endif +} -/// Helper function to return true when called by thread 0 of threadblock 0. -CUTLASS_HOST_DEVICE bool thread0() { +/// Returns a warp-uniform value indicating the canonical warp group index of the calling threads. +/// Threads within the warp must be converged. +CUTLASS_DEVICE +int canonical_warp_group_idx() { #if defined(__CUDA_ARCH__) - return (!threadIdx.x && !threadIdx.y && !threadIdx.z) && (!blockIdx.x && !blockIdx.y && !blockIdx.z); + return __shfl_sync(0xffffffff, threadIdx.x / NumThreadsPerWarpGroup, 0); #else - return false; + return 0; #endif } @@ -192,4 +158,3 @@ CUTLASS_HOST_DEVICE bool thread0() { } // namespace cutlass //////////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/include/cutlass/detail/collective.hpp b/include/cutlass/detail/collective.hpp new file mode 100644 index 0000000000..9d8f9e2f1d --- /dev/null +++ b/include/cutlass/detail/collective.hpp @@ -0,0 +1,64 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cute/container/tuple.hpp" +#include "cute/layout.hpp" // cute::size(shape) +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +struct deduce_mixed_width_dtype { +static_assert(I >= 0u && I <= 2u, "Valid indices are 0, 1, and 2, which represent Operand, Scale, and Bias, respectively."); + +private: + using underlying_tuple = cute::conditional_t::value, Tuple, cute::tuple>; + static constexpr size_t valid_index = cute::min(I, cute::tuple_size_v - 1); + +public: + using type = cute::conditional_t<(I < cute::tuple_size_v), + cute::tuple_element_t, + void>; +}; + +template +using deduce_mixed_width_dtype_t = typename deduce_mixed_width_dtype::type; + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective diff --git a/include/cutlass/detail/collective/mixed_input_utils.hpp b/include/cutlass/detail/collective/mixed_input_utils.hpp new file mode 100644 index 0000000000..c740eb98b2 --- /dev/null +++ b/include/cutlass/detail/collective/mixed_input_utils.hpp @@ -0,0 +1,1017 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_conversion.h" + +#include "cute/util/type_traits.hpp" +#include "cute/arch/copy_sm90.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +// The universal converter +template < + class SrcType, + class DstType, + class LayoutIn, + class LayoutOut +> +struct LayoutAwareConvertImpl { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor const& src, + cute::Tensor & dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + static_assert(cute::cosize_v == cute::cosize_v); + constexpr int N = decltype(cute::max_common_vector(LayoutIn{}, LayoutOut{})){}; + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using Converter = cutlass::NumericArrayConverter; + auto&& src_vm = cute::recast(src); + auto&& dst_vm = cute::recast(dst); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i BF16 with [02461357] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::int4b_t, + cutlass::bfloat16_t, + cute::Layout, cute::Stride<_4,_1>>, + cute::Layout<_8> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_4,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (size_t ii = 0; ii < RegArray::kElements; ++ii) { + r[ii] = src_reg >> (4 * (ii)); + static constexpr uint32_t xor_mask = 0x43084308; + static constexpr uint32_t lo_mask = 0x000F000F; + static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii]) + : "n"(lo_mask), "n"(xor_mask), "n"(immLut)); + static constexpr uint32_t lo_bias = xor_mask; // 0x43084308, {136, 136} + { + __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + bf16x2_val = __hsub2(bf16x2_val, + reinterpret_cast(lo_bias)); + } + } + } +}; + +// Specialization for UINT4 -> BF16 with [02461357] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::uint4b_t, + cutlass::bfloat16_t, + cute::Layout, cute::Stride<_4,_1>>, + cute::Layout<_8> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_4,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = 
cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (size_t ii = 0; ii < RegArray::kElements; ++ii) { + r[ii] = src_reg >> (4 * (ii)); + static constexpr uint32_t or_mask = 0x43004300; + static constexpr uint32_t lo_mask = 0x000F000F; + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii]) + : "n"(lo_mask), "n"(or_mask), "n"(immLut)); + static constexpr uint32_t lo_bias = or_mask; // 0x43004300, {128, 128} + { + __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + bf16x2_val = __hsub2(bf16x2_val, + reinterpret_cast(lo_bias)); + } + } + } +}; + +// Specialization for INT4 -> FP16 with [02461357] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::int4b_t, + cutlass::half_t, + cute::Layout, cute::Stride<_4,_1>>, + cute::Layout<_8> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_4,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ii += 2) { + auto src_ = src_reg >> (4 * (ii)); + r[ii + 0] = src_; + r[ii + 1] = src_; + static constexpr uint32_t lo_xor_mask = 0x64086408; + static constexpr uint32_t hi_xor_mask = 0x64806480; + static constexpr uint32_t lo_mask = 0x000F000F; + static constexpr uint32_t hi_mask = 0x00F000F0; + static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii + 0]) + : "n"(lo_mask), "n"(lo_xor_mask), "n"(immLut)); + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii + 1]) + : "n"(hi_mask), "n"(hi_xor_mask), "n"(immLut)); + static constexpr uint32_t lo_bias = 0x64086408; // {1032, 1032} + static constexpr uint32_t hi_bias = 0xD480D480; // {-72, -72} + static constexpr uint32_t hi_scale = 0x2C002C00; // {1/16, 1/16} + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 0]); + fp16x2_val = __hsub2(fp16x2_val, + reinterpret_cast(lo_bias)); + } + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 1]); + fp16x2_val = __hfma2(fp16x2_val, + reinterpret_cast(hi_scale), + reinterpret_cast(hi_bias)); + } + } + } +}; + +// Specialization for UINT4 -> FP16 with [02461357] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::uint4b_t, + cutlass::half_t, + cute::Layout, cute::Stride<_4,_1>>, + cute::Layout<_8> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_4,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ii += 2) { + auto src_ = src_reg >> (4 * (ii)); + r[ii + 0] = src_; + r[ii + 1] = src_; + static constexpr uint32_t or_mask = 0x64006400; + static constexpr uint32_t lo_mask = 0x000F000F; + static constexpr uint32_t hi_mask = 0x00F000F0; + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 
0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii]) + : "n"(lo_mask), "n"(or_mask), "n"(immLut)); + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii + 1]) + : "n"(hi_mask), "n"(or_mask), "n"(immLut)); + static constexpr uint32_t lo_bias = or_mask; // 0x64006400, {1024, 1024} + static constexpr uint32_t hi_bias = 0xD400D400; // {-64, -64} + static constexpr uint32_t hi_scale = 0x2C002C00; // {1/16, 1/16} + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 0]); + fp16x2_val = __hsub2(fp16x2_val, + reinterpret_cast(lo_bias)); + } + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 1]); + fp16x2_val = __hfma2(fp16x2_val, + reinterpret_cast(hi_scale), + reinterpret_cast(hi_bias)); + } + } + } +}; + +// Specialization for E5M2 -> FP16 with [3120] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::float_e5m2_t, + cutlass::half_t, + cute::Layout, cute::Stride<_2,_1>>, + cute::Layout<_4> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_2,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + // in registers: a3, a1, a2, a0 + r[RegArray::kElements - ii - 1] = src_reg << (8 * (ii)); + + static constexpr uint32_t and_mask = 0xFF00FF00; + asm volatile( + "{\n" + " and.b32 %0, %0, %1;\n" + "}\n" + : "+r"(r[ii]) + : "n"(and_mask)); + } + } +}; + +// Specialization for INT8 -> BF16 with [3120] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::int8_t, + cutlass::half_t, + cute::Layout, cute::Stride<_2,_1>>, + cute::Layout<_4> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_2,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + uint32_t tmp0, tmp1; + r[ii] = src_reg >> (8 * (ii)); + static constexpr uint32_t or_mask = 0x43004300; + static constexpr uint32_t and_mask_0 = 0x007F007F; + static constexpr uint32_t and_mask_1 = 0x00800080; + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %1, %2, %3, %4;\n" + "}\n" + : "=r"(tmp0) + : "r"(r[ii]), "n"(and_mask_0), "n"(or_mask), "n"(immLut)); + asm volatile( + "{\n" + " lop3.b32 %0, %1, %2, %3, %4;\n" + "}\n" + : "=r"(tmp1) + : "r"(r[ii]), "n"(and_mask_1), "n"(or_mask), "n"(immLut)); + { + __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + bf16x2_val = __hsub2(reinterpret_cast<__nv_bfloat162 const&>(tmp0), + reinterpret_cast<__nv_bfloat162 const&>(tmp1)); + } + } + } +}; + +// Specialization for INT8 -> FP16 with [3120] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::int8_t, + cutlass::bfloat16_t, + cute::Layout, cute::Stride<_2,_1>>, + cute::Layout<_4> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_2,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using 
SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + r[ii] = src_reg >> (8 * (ii)); + static constexpr uint32_t xor_mask = 0x64806480; + static constexpr uint32_t and_mask = 0x00FF00FF; + static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii]) + : "n"(and_mask), "n"(xor_mask), "n"(immLut)); + { + static constexpr uint32_t bias = 0x64806480; + __half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]); + fp16x2_val = __hsub2(fp16x2_val, + reinterpret_cast<__half2 const&>(bias)); + } + } + } +}; + +template < + class EngineIn, + class EngineOut, + class LayoutIn, + class LayoutOut +> +CUTLASS_DEVICE +void LayoutAwareConvert( // Accept mutable temporaries + cute::Tensor const& src, + cute::Tensor && dst) { + + LayoutAwareConvert(src, dst); +} +template < + class EngineIn, + class EngineOut, + class LayoutIn, + class LayoutOut +> +CUTLASS_DEVICE +void LayoutAwareConvert( + cute::Tensor const& src, + cute::Tensor & dst) { + + using SrcType = typename EngineIn::value_type; + using DstType = typename EngineOut::value_type; + Tensor src_vm = coalesce(src); + Tensor dst_vm = coalesce(dst); + Layout src_layout = src_vm.layout(); + Layout dst_layout = dst_vm.layout(); + LayoutAwareConvertImpl::convert(src_vm, dst_vm); +} + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective::detail { + +template +static constexpr +CUTLASS_HOST_DEVICE +auto get_logical_ptr(PointerType const* ptr) { + if constexpr (cute::sizeof_bits_v < 8) { + return subbyte_iterator(ptr); + } + else { + return ptr; + } +} +template +static constexpr +CUTLASS_HOST_DEVICE +auto get_smem_layout(LayoutAtom layout_atom, TileShape const& tile_shape, Stride const& stride) { + if constexpr (not cute::is_layout::value) { + return tile_to_shape( + layout_atom, + append(tile_shape, Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,Stride>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}); + } + else { + auto gmem_tile = composition(stride, tile_shape); + return make_layout_like(append(gmem_tile, make_layout(Int{}, 0))); + } +} +template +static constexpr +CUTLASS_HOST_DEVICE +auto get_gmem_layout(Shape const& shape, Stride const& stride) { + if constexpr (not cute::is_layout::value) { + return make_layout(shape, stride); + } + else { + return stride; + } +} + +template +struct MixedInputUtils { +private: + using KernelSchedule = typename Collective::KernelSchedule; + using ConversionMode = typename Collective::ConversionMode; + using SmemLayoutA = typename Collective::SmemLayoutA; + using SmemLayoutB = typename Collective::SmemLayoutB; + using SmemLayoutScale = typename Collective::SmemLayoutScale; + using SwappedElementA = typename Collective::SwappedElementA; + using SwappedElementB = typename Collective::SwappedElementB; + using RealSwappedElementA = typename Collective::RealSwappedElementA; + using RealSwappedElementB = typename Collective::RealSwappedElementB; + using ElementScale = typename Collective::ElementScale; + using ElementZero = typename Collective::ElementZero; + using SmemCopyAtomScale = typename Collective::SmemCopyAtomScale; + static constexpr auto KernelConversionMode = Collective::KernelConversionMode; + static constexpr auto 
ModeHasScales = Collective::ModeHasScales; + static constexpr auto UseScaleLookupTable = Collective::UseScaleLookupTable; + +public: + static constexpr auto + elements_per_smem_scale() { + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + return 0; + } + else if constexpr (ModeHasScales) { + return cute::cosize_v; + } + else { + static_assert(cutlass::detail::dependent_false, "Type not handled in scale smem allocation."); + } + } + + static constexpr auto + elements_per_smem_zero() { + if constexpr (KernelConversionMode == ConversionMode::DirectConvert || + KernelConversionMode == ConversionMode::ConvertAndScale ) { + return 0; + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + return cute::cosize_v; + } + else { + static_assert(cutlass::detail::dependent_false, "Type not handled in scale smem allocation."); + } + } + + // These methods use some the public members of the class. For that reason, we define them after the public section. + static constexpr uint32_t + compute_tma_transaction_bytes_mk() { + return cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(cute::sizeof_bits_v)); + } + + static constexpr uint32_t + compute_tma_transaction_bytes_nk() { + return cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(cute::sizeof_bits_v)); + } + + static constexpr uint32_t + compute_tma_transaction_bytes_extra() { + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + return 0; + } + else if constexpr (ModeHasScales) { + constexpr uint32_t scale_tx_bytes = cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) * static_cast(cute::sizeof_bits_v)); + static_assert(scale_tx_bytes % 128 == 0, "Each scale stage must be 128B aligned."); // required by TMA + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return scale_tx_bytes; + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + // Scale and zero share smem layout + constexpr uint32_t zero_tx_bytes = cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) * static_cast(cute::sizeof_bits_v)); + static_assert(zero_tx_bytes % 128 == 0, "Each zero stage must be 128B aligned."); // required by TMA + return scale_tx_bytes + zero_tx_bytes; + } + else { + static_assert(cutlass::detail::dependent_false, "Type not handled in tma transaction bytes computation."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Type not handled in tma transaction bytes computation."); + } + } + + /// Utilities to copy A and extra inputs from smem to RF + template + CUTLASS_DEVICE + static void copy_tensors_MK( + SmemTiledCopyA const& smem_tiled_copy_A, + TensorASmemView const& tCsA, + TensorACopyView& tCrA_copy_view, + cute::tuple const& partitioned_mma_extra_info, + cute::tuple const& tiled_copy_and_views, + int k_block, + int read_stage) { + + copy(smem_tiled_copy_A, tCsA(_,_,k_block,read_stage), tCrA_copy_view(_,_,k_block)); + + if (k_block == 0) { + // We are starting a new k-tile so copy the scale + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + // nothing to do + } + else if constexpr (ModeHasScales) { + auto smem_tiled_copy_S = cute::get<0>(tiled_copy_and_views); + auto tCrS_copy_view = cute::get<1>(tiled_copy_and_views); + auto tCsS = cute::get<0>(partitioned_mma_extra_info); + copy(smem_tiled_copy_S, tCsS(_,_,k_block,read_stage), tCrS_copy_view(_,_,k_block)); + 
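+        // Note: the scale fragment copied just above is needed by both scaled modes;
+        // the ConvertAndScaleWithZero branch below additionally stages the zero-point
+        // fragment through the same tiled copy.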
if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + // Nothing extra to do + } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + auto tCsZ = cute::get<2>(partitioned_mma_extra_info); + auto tCrZ_copy_view = cute::get<2>(tiled_copy_and_views); + copy(smem_tiled_copy_S, tCsZ(_,_,k_block,read_stage), tCrZ_copy_view(_,_,k_block)); + } else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + } + + // The core converter uses a lookup table to converts i4 -> 8 bit value. + template + CUTLASS_DEVICE + static void lookup_table_convert( // Accept mutable temporaries + Tensor const& src, + Tensor && dst, + Tensor const& scales_neg, + Tensor const& scales_pos) { + + lookup_table_convert(src, dst, scales_neg, scales_pos); + } + template + CUTLASS_DEVICE + static void lookup_table_convert( + Tensor const& src, + Tensor & dst, + Tensor const& scales_neg, + Tensor const& scales_pos) { + + constexpr int N = cute::cosize(LayoutIn{}); + static_assert(N == 4 || N == 8); + static_assert(cosize(LayoutScale{}) <= N / 4, + "at least 4 consecutive weights must share the same scale."); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + // View the input as reg + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + + // Determines if to get from the signed or unsigned candidates + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + uint32_t sign; // ((reg & 0x88888888) | 0x64206420) >> 1 + asm volatile( + "{\n" + " lop3.b32 %0, %1, %2, %3, %4;\n" \ + "}\n" + : "=r"(sign) + : "r"(src_reg), "n"(0x88888888), "n"(0x64206420), "n"(immLut) + ); + sign = sign >> 1; + + // Ignore sign bit when indexing into LUT + uint32_t lut_idx = src_reg & 0x77777777; + Tensor scales_neg_ = cute::filter(scales_neg); + Tensor scales_pos_ = cute::filter(scales_pos); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 4; ++i, lut_idx >>=16, sign >>=16) { + auto&& scale_neg_ = reinterpret_cast const&>(scales_neg_(i)); + auto&& scale_pos_ = reinterpret_cast const&>(scales_pos_(i)); + asm volatile( + "{\n" + " .reg .b32 pos, neg ;\n" \ + " prmt .b32 neg, %3, %4, %1 ;\n" \ + " prmt .b32 pos, %5, %6, %1 ;\n" \ + " prmt .b32 %0, pos, neg, %2 ;\n" \ + "}\n" + : "=r"(r[i]) + : "r"(lut_idx), "r"(sign), "r"(scale_neg_[0]), "r"(scale_neg_[1]), "r"(scale_pos_[0]), "r"(scale_pos_[1]) + ); + } + } + + /// Utilities to dequantize A. 
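+  /// In scalar terms (a rough sketch that ignores the register-level packing and
+  /// vectorization used below), the supported modes compute, per element,
+  ///   DirectConvert:            dst = DstType(src)
+  ///   ConvertAndScale:          dst = DstType(src) * scale
+  ///   ConvertAndScaleWithZero:  dst = DstType(src) * scale + zero
+  /// e.g. a hypothetical element-wise reference of the last mode would be
+  ///   Dst dequantize_one(Src q, Scale s, Zero z) { return Dst(q) * Dst(s) + Dst(z); }
+  /// The scale-lookup-table path instead folds the int4 -> 8-bit conversion and the
+  /// scale into byte-permutes over two 8-entry tables (negative and positive halves),
+  /// selected by the sign bit of each 4-bit source value.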
+ template + CUTLASS_DEVICE + static void static_check_scale(Layout const& tensor) { + static_assert(shape<0>(Layout{}) >= 4 && stride<0>(Layout{}) == 0, "At least 4 adjacent weights in a thread must share the same scale."); + } + template + CUTLASS_DEVICE + static void static_check_scale(Tensor const& tensor) { + static_check_scale(flatten(Layout{})); + } + template + CUTLASS_DEVICE + static void dequantize_A_kblock( + Tensor const& tCrA_load, + Tensor& tCrA_mma, + cute::tuple& partitioned_extra_info, + int const k_block) { + + static_assert(is_rmem::value, "Input tensor for A conversion must come from registers"); + static_assert(is_rmem::value, "Output tensor for A conversion must come from registers"); + static_assert(cosize_v == cosize_v); + static_assert(size_v == cosize_v); + static_assert(size_v == cosize_v); + using SrcType = typename EngineIn::value_type; + using DstType = typename EngineOut::value_type; + + Tensor src = tCrA_load(_, _, k_block); + Tensor dst = tCrA_mma(_, _, k_block); + + CUTE_STATIC_ASSERT_V(size(src(_, 0)) == cosize(src(_, 0).layout()), + "The first mode of tensor src must be contiguous in memory"); + // try to make the size of the first mode equal to 32bit + int constexpr NumValPerSrcReg = cute::min(decltype(size(src(_, 0)))::value, + ceil_div(32, sizeof_bits_v)); + Tensor src_vm = cute::group_modes<1,-1>(cute::zipped_divide(src, Int{})); + Tensor dst_vm = cute::group_modes<1,-1>(cute::zipped_divide(dst, Int{})); + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + LayoutAwareConvert(src_vm(_, i), dst_vm(_, i)); + } + } + else if constexpr (UseScaleLookupTable) { + constexpr int num_elements = decltype(size(src))::value; + static_assert(is_same_v, "Lookup table only supports int4 being the quant type now."); + static_assert(sizeof_bits_v == 64, "Lookup table only supports 8 8bit scale values now."); + static_assert(num_elements % 4 == 0 && num_elements >= 4, "Lookup table requires a vector size of 4x when converting."); + + Tensor tCrS_neg = cute::get<1>(partitioned_extra_info); + auto&& tCrS_pos = cute::get<2>(partitioned_extra_info); // modification to its value is needed + Tensor scales_neg = tCrS_neg(_, _, k_block); + Tensor scales_pos = tCrS_pos(_, _, k_block); + CUTE_STATIC_ASSERT_V(cute::size(src) == cute::size(scales_neg)); + + static_check_scale(scales_neg); + static_check_scale(scales_pos); + Tensor scales_neg_vm = cute::group_modes<1,-1>(cute::zipped_divide(scales_neg, Int{})); + Tensor scales_pos_vm = cute::group_modes<1,-1>(cute::zipped_divide(scales_pos, Int{})); + + if (k_block == 0) { + Tensor scales_neg_vm_ = filter(scales_neg_vm); + Tensor scales_pos_vm_ = filter(scales_pos_vm); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(scales_neg_vm_.layout()); ++i) + { + auto&& scale_neg_ = reinterpret_cast const&>(scales_neg_vm_(i)); + auto&& scale_pos_ = reinterpret_cast &>(scales_pos_vm_(i)); + constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; + asm volatile( + "{\n" + " lop3 .b32 %0, %2, %4, %5, %6;\n" \ + " xor .b32 %1, %3, %5; \n" \ + "}\n" + : "=r"(scale_pos_[0]), "=r"(scale_pos_[1]) + : "r"(scale_neg_[0]), "r"(scale_neg_[1]), "n"(0xFFFFFF00), "n"(0x80808080), "n"(immLut) + ); + } + } + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + lookup_table_convert(src_vm(_, i), dst_vm(_, i), scales_neg_vm(_, i), scales_pos_vm(_, i)); + } + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + Tensor scales 
= cute::get<1>(partitioned_extra_info)(_, _, k_block); + CUTE_STATIC_ASSERT_V(size(src) == size(scales)); + Tensor scales_vm = cute::group_modes<1,-1>(cute::zipped_divide(scales, Int{})); + + if constexpr (is_same_v) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + LayoutAwareConvert(src_vm(_, i), dst_vm(_, i)); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<0>(dst_vm); ++j) { + dst_vm(j, i) *= scales_vm(j, i); + } + } + } + else { + auto stage = make_tensor_like(src_vm(_, 0)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + LayoutAwareConvert(src_vm(_, i), stage); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<0>(dst_vm); ++j) { + stage(j) *= scales_vm(j, i); + } + LayoutAwareConvert(stage, dst_vm(_, i)); + } + } + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + static_assert(is_same_v, "ElementScale and ElementZero must be the same."); + Tensor scales = cute::get<1>(partitioned_extra_info)(_, _, k_block); + Tensor zeros = cute::get<3>(partitioned_extra_info)(_, _, k_block); + CUTE_STATIC_ASSERT_V(size(src) == size(scales)); + CUTE_STATIC_ASSERT_V(size(src) == size(zeros)); + Tensor scales_vm = cute::group_modes<1,-1>(cute::zipped_divide(scales, Int{})); + Tensor zeros_vm = cute::group_modes<1,-1>(cute::zipped_divide(zeros, Int{})); + + if constexpr (is_same_v) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + LayoutAwareConvert(src_vm(_, i), dst_vm(_, i)); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<0>(dst_vm); ++j) { + dst_vm(j, i) = dst_vm(j, i) * scales_vm(j, i) + zeros_vm(j, i); + } + } + } + else { + auto stage = make_tensor_like(src_vm(_, 0)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + LayoutAwareConvert(src_vm(_, i), stage); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<0>(dst_vm); ++j) { + stage(j) = stage(j) * scales_vm(j, i) + zeros_vm(j, i); + } + LayoutAwareConvert(stage, dst_vm(_, i)); + } + } + } + else { + static_assert(cutlass::detail::dependent_false, "No A data is loaded."); + } + } + + /// Utilities for any additional inputs inside of the TMA load + template < + class Params, + class TensorStorage, + class... 
Ts + > + CUTLASS_DEVICE + static auto partition_extra_tma_inputs( + Params const& mainloop_params, + cute::tuple const& load_inputs, + TensorStorage& shared_tensors, + uint2 const& cluster_local_block_id, + int const m_coord, + int const l_coord) { + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + return cute::make_tuple(); + } + else if constexpr (ModeHasScales) { + Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{}); // (BLK_M,BLK_K,PIPE) + Tensor gS_mkl = get<2>(load_inputs); + auto block_tma_s = mainloop_params.tma_load_scale.get_slice(cluster_local_block_id.y); + Tensor gS = gS_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + + Tensor tSgS = block_tma_s.partition_S(gS); // (TMA,TMA_M,TMA_K,k) + Tensor tSsS = block_tma_s.partition_D(sS); // (TMA,TMA_M,TMA_K,PIPE) + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return cute::make_tuple(tSgS, tSsS); + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + Tensor sZ = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()), SmemLayoutScale{}); // (BLK_M,BLK_K,PIPE) + Tensor gZ_mkl = get<3>(load_inputs); + auto block_tma_z = mainloop_params.tma_load_zero.get_slice(cluster_local_block_id.y); + Tensor gZ = gZ_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + + Tensor tZgZ = block_tma_z.partition_S(gZ); // (TMA,TMA_M,TMA_K,k) + Tensor tZsZ = block_tma_z.partition_D(sZ); // (TMA,TMA_M,TMA_K,PIPE) + return cute::make_tuple(tSgS, tSsS, tZgZ, tZsZ); + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled for input partitioning."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled for input partitioning."); + } + } + + /// Utilities for partitioning extra inputs for loading from smem in the mainloop. 
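+  /// The tuple returned by the helper below mirrors the active conversion mode:
+  ///   DirectConvert           -> ()
+  ///   scale lookup table      -> (tCsS, tCrS_neg, tCrS_pos)
+  ///   ConvertAndScale         -> (tCsS, tCrS)
+  ///   ConvertAndScaleWithZero -> (tCsS, tCrS, tCsZ, tCrZ)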
+ template < + class ThreadMma, + class TensorStorage + > + CUTLASS_DEVICE + static auto partition_extra_mma_info( + ThreadMma const& mma_thread_slice, + TensorStorage& shared_tensors) { + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + // nothing to do + return cute::make_tuple(); + } + else if constexpr (UseScaleLookupTable) { + Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE) + Tensor tCsS = mma_thread_slice.partition_A(sS); + Tensor tCrS_neg = make_tensor(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); + Tensor tCrS_pos = make_tensor(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return cute::make_tuple(tCsS, tCrS_neg, tCrS_pos); + } + } + else if constexpr (ModeHasScales) { + Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE) + Tensor tCsS = mma_thread_slice.partition_A(sS); + Tensor tCrS = make_tensor(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return cute::make_tuple(tCsS, tCrS); + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + Tensor sZ = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE) + Tensor tCsZ = mma_thread_slice.partition_A(sZ); + Tensor tCrZ = make_tensor(mma_thread_slice.partition_fragment_A(sZ(_,_,Int<0>{})).layout()); + return cute::make_tuple(tCsS, tCrS, tCsZ, tCrZ); + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + + /// Returns the tiled copy and copy views for the extra inputs. 
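+  /// As with the partitioning helper above, the tuple shape follows the mode:
+  ///   DirectConvert           -> ()
+  ///   ConvertAndScale         -> (smem_tiled_copy_S, tCrS_copy_view)
+  ///   ConvertAndScaleWithZero -> (smem_tiled_copy_S, tCrS_copy_view, tCrZ_copy_view)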
+ template + CUTLASS_DEVICE + static auto retile_extra_mma_info( + TiledMma const& tiled_mma, + cute::tuple& partitioned_extra_info, + int const warp_group_thread_idx) { + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + // nothing to do + return cute::make_tuple(); + } + else if constexpr (ModeHasScales) { + auto smem_tiled_copy_S = make_tiled_copy_A(SmemCopyAtomScale{}, tiled_mma); + auto smem_thr_copy_S = smem_tiled_copy_S.get_thread_slice(warp_group_thread_idx); + Tensor tCrS_copy_view = smem_thr_copy_S.retile_D(cute::get<1>(partitioned_extra_info)); // (CPY,CPY_M,CPY_K) + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view); + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + Tensor tCrZ_copy_view = smem_thr_copy_S.retile_D(cute::get<3>(partitioned_extra_info)); // (CPY,CPY_M,CPY_K) + return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view, tCrZ_copy_view); + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } +}; + +} // cutlass::gemm::collective::detail diff --git a/include/cutlass/detail/dependent_false.hpp b/include/cutlass/detail/dependent_false.hpp new file mode 100644 index 0000000000..76e52d2bf8 --- /dev/null +++ b/include/cutlass/detail/dependent_false.hpp @@ -0,0 +1,86 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::detail { + +/// @brief A bool constant that depends on one or more template parameters. +/// +/// For more detailed documentation and use cases, +/// please see `dependent_false` below. +template +inline constexpr bool dependent_bool_value = Value; + +/// @brief An always-false value that depends on one or more template parameters. +/// +/// This exists because `static_assert(false);` always fails, +/// even if it occurs in the `else` branch of an `if constexpr`. +/// The following example shows how to use `dependent_false` in that case. +/// +/// @code +/// template +/// void foo (T t) +/// { +/// if constexpr (std::is_integral_v) { +/// do_integer_stuff(t); +/// } +/// else if constexpr (std::is_floating_point_v) { +/// do_floating_point_stuff(t); +/// } +/// else { +/// static_assert(dependent_false, "T must be " +/// "an integral or floating-point type."); +/// } +/// } +/// @endcode +/// +/// This implements the C++ Standard Library proposal P1830R1. +/// +/// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1830r1.pdf +/// +/// That proposal is under review as of 2022/12/05. +/// The following link shows P1830's current review status. +/// +/// https://github.com/cplusplus/papers/issues/572 +/// +/// P2593R0 proposes an alternate solution to this problem, +/// that would change the C++ language itself. +/// +/// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html +/// +/// For headers in this library, however, we only consider library solutions +/// as work-arounds for future C++ features. +template +inline constexpr bool dependent_false = dependent_bool_value; + +} // end namespace cutlass::detail diff --git a/include/cutlass/detail/helper_macros.hpp b/include/cutlass/detail/helper_macros.hpp new file mode 100644 index 0000000000..039f5e841a --- /dev/null +++ b/include/cutlass/detail/helper_macros.hpp @@ -0,0 +1,211 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Helper macros for the CUTLASS library +*/ + +#pragma once + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifdef CUTLASS_NAMESPACE +#define concat_tok(a, b) a ## b +#define mkcutlassnamespace(pre, ns) concat_tok(pre, ns) +#define cutlass mkcutlassnamespace(cutlass_, CUTLASS_NAMESPACE) +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) +#define CUTLASS_HOST_DEVICE __forceinline__ __device__ __host__ +#define CUTLASS_DEVICE __forceinline__ __device__ +#elif defined(__CUDACC_RTC__) +#define CUTLASS_HOST_DEVICE __forceinline__ __device__ +#define CUTLASS_DEVICE __forceinline__ __device__ +#else +#define CUTLASS_HOST_DEVICE inline +#define CUTLASS_DEVICE inline +#endif + +#if ! defined(_MSC_VER) +#define CUTLASS_LAMBDA_FUNC_INLINE __attribute__((always_inline)) +#else +#define CUTLASS_LAMBDA_FUNC_INLINE [[msvc::forceinline]] +#endif + +#define CUTLASS_HOST __host__ +#define CUTLASS_GLOBAL __global__ static + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +CUTLASS_HOST_DEVICE void __CUTLASS_UNUSED(T const &) +{ } + +#if defined(__GNUC__) + #define CUTLASS_UNUSED(expr) __CUTLASS_UNUSED(expr) +#else + #define CUTLASS_UNUSED(expr) do { ; } while (&expr != &expr) +#endif + +#ifdef _MSC_VER +// Provides support for alternative operators 'and', 'or', and 'not' +#include +#endif // _MSC_VER + +#if !defined(__CUDACC_RTC__) +#include +#endif + +#if defined(__CUDA_ARCH__) + #if defined(_MSC_VER) + #define CUTLASS_NOT_IMPLEMENTED() { printf("%s not implemented\n", __FUNCSIG__); asm volatile ("brkpt;\n"); } + #else + #define CUTLASS_NOT_IMPLEMENTED() { printf("%s not implemented\n", __PRETTY_FUNCTION__); asm volatile ("brkpt;\n"); } + #endif +#else + #if defined(_MSC_VER) + #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__) + #else + #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__) + #endif +#endif + +// CUTLASS_CMATH_NAMESPACE is the namespace where code can find +// functions like isnan and log. Such functions are in +// the std namespace in host code, but in the global namespace +// in device code. +// +// The intended use case for this macro is in "using" declarations +// for making argument-dependent lookup (ADL) work in generic code. +// For example, if T is cutlass::half_t, the following code will +// invoke cutlass::isnan(half_t). If T is float, it will invoke +// std::isnan on host and ::isnan on device. (CUTLASS's support +// for NVRTC prevents it from using things in the std namespace +// in device code.) Correct use of "using" declarations can help +// avoid unexpected implicit conversions, like from half_t to float. 
+// +// template +// bool foo(T x) { +// using CUTLASS_CMATH_NAMESPACE :: isnan; +// return isnan(x); +// } +// +// Without this macro, one would need to write the following. +// +// template +// bool foo(T x) { +// #if defined(__CUDA_ARCH__) +// using ::isnan; +// #else +// using std::isnan; +// #endif +// return isnan(x); +// } + +#if defined(__CUDA_ARCH__) +# define CUTLASS_CMATH_NAMESPACE +#else +# define CUTLASS_CMATH_NAMESPACE std +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + + +#ifndef CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED +#define CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED 0 +#endif + + +// CUDA 10.1 introduces the mma instruction +#if !defined(CUTLASS_ENABLE_TENSOR_CORE_MMA) +#define CUTLASS_ENABLE_TENSOR_CORE_MMA 0 +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CUTLASS_ASSERT(x) assert(x) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler. +#if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__) + #if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) + #define CUTLASS_PRAGMA_UNROLL _Pragma("unroll") + #define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1") + #else + #define CUTLASS_PRAGMA_UNROLL #pragma unroll + #define CUTLASS_PRAGMA_NO_UNROLL #pragma unroll 1 + #endif + + #define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL + +#else + + #define CUTLASS_PRAGMA_UNROLL + #define CUTLASS_PRAGMA_NO_UNROLL + #define CUTLASS_GEMM_LOOP + +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#if !defined(__CUDACC_RTC__) +#define CUTLASS_THREAD_LOCAL thread_local +#else +#define CUTLASS_THREAD_LOCAL +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(_MSVC_LANG) +# define CUTLASS_CPLUSPLUS _MSVC_LANG +#else +# define CUTLASS_CPLUSPLUS __cplusplus +#endif + +#if (201700L <= CUTLASS_CPLUSPLUS) +#define CUTLASS_CONSTEXPR_IF_CXX17 constexpr +#define CUTLASS_CXX17_OR_LATER 1 +#else +#define CUTLASS_CONSTEXPR_IF_CXX17 +#define CUTLASS_CXX17_OR_LATER 0 +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +}; // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/detail/layout.hpp b/include/cutlass/detail/layout.hpp new file mode 100644 index 0000000000..cbed61f683 --- /dev/null +++ b/include/cutlass/detail/layout.hpp @@ -0,0 +1,406 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cute/layout.hpp" +#include "cute/pointer_sparse.hpp" // cute::is_sparse +#include "cute/swizzle.hpp" // cute::Swizzle +#include "cute/swizzle_layout.hpp" // cute::detail::get_swizzle_portion +#include "cute/util/type_traits.hpp" +#include "cute/arch/copy_sm90_tma.hpp" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/numeric_types.h" +#include "cutlass/detail/collective.hpp" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::detail { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// For each cutlass::layout, provides its corresponding cute stride types, 64b by default + +template +struct TagToStrideA { + using type = L; +}; + +// Maps to modes [M, K, L] +template <> +struct TagToStrideA { + using type = cute::Stride, int64_t>; + using tag = layout::RowMajor; +}; + +// Maps to modes [M, K, L] +template <> +struct TagToStrideA { + using type = cute::Stride, int64_t, int64_t>; + using tag = layout::ColumnMajor; +}; + +template +struct TagToStrideB { + using type = L; +}; + +// Maps to modes [N, K, L] +template <> +struct TagToStrideB { + using type = cute::Stride, int64_t, int64_t>; + using tag = layout::RowMajor; +}; + +// Maps to modes [N, K, L] +template <> +struct TagToStrideB { + using type = cute::Stride, int64_t>; + using tag = layout::ColumnMajor; +}; + +// For each cutlass::layout *, provides its corresponding cute stride types, 64b by default +// Used by pointer array and grouped gemm +// Maps to modes [M, K, L] +template <> +struct TagToStrideA { + using UnderlyingType = cute::Stride, cute::Int<0>>; + using type = UnderlyingType*; + using tag = layout::RowMajor; +}; + +// Maps to modes [M, K, L] +template <> +struct TagToStrideA { + using UnderlyingType = cute::Stride, int64_t, cute::Int<0>>; + using type = UnderlyingType*; + using tag = layout::ColumnMajor; +}; + +// Maps to modes [N, K, L] +template <> +struct TagToStrideB { + using UnderlyingType = cute::Stride, int64_t, cute::Int<0>>; + using type = UnderlyingType*; + using tag = layout::RowMajor; +}; + +// Maps to modes [N, K, L] +template <> +struct TagToStrideB { + using UnderlyingType = cute::Stride, cute::Int<0>>; + using type = UnderlyingType*; + using tag = layout::ColumnMajor; +}; + +// Maps to modes [M, N, L] +template +struct TagToStrideC : 
TagToStrideA { }; + +// Conv: Maps to modes ((P,N), C, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, cute::Int<1>, cute::Int<0>>; +}; + +// Conv: Maps to modes ((P,Q,N), C, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, cute::Int<1>, cute::Int<0>>; +}; + +// Conv: Maps to modes ((P,Q,Z,N), C, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, cute::Int<1>, cute::Int<0>>; +}; + +// Conv: Maps to modes (K, (C,S), _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t>, cute::Int<0>>; +}; + +// Conv: Maps to modes (K, (C,S,R), _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t, int64_t>, cute::Int<0>>; +}; + +// Conv: Maps to modes (K, (C,S,R,T), _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t, int64_t, int64_t>, cute::Int<0>>; +}; + +// Conv: Maps to modes ((C,S), K, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t>, int64_t, cute::Int<0>>; +}; + +// Conv: Maps to modes ((C,S,R), K, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t, int64_t>, int64_t, cute::Int<0>>; +}; + +// Conv: Maps to modes ((C,S,R,T), K, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t, int64_t, int64_t>, int64_t, cute::Int<0>>; +}; + +// Convenience aliases +template +using TagToStrideA_t = typename TagToStrideA::type; + +template +using TagToStrideB_t = typename TagToStrideB::type; + +template +using TagToStrideC_t = typename TagToStrideC::type; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// For 2.x compatibility APIs, provide stride->layout tag mappers + +template +constexpr bool +is_major(Stride = {}) { + // Account for stride types with and without batch mode and batch modes with static zero stride + return cute::is_constant<1, decltype(cute::front(cute::get(cute::remove_pointer_t{})))>::value; +} + +template +constexpr bool +is_major(cute::Layout = {}) { + return is_major(Stride{}); +} + +// Note : This method can be used for deducing the Layout Tag of A, C, D Matrices +template +constexpr +auto +stride_to_layout_tag_A() { + using InternalStrideA = cute::remove_pointer_t; + if constexpr (cute::is_layout::value) { + return stride_to_layout_tag_A(); + } + else if constexpr (is_major<0, StrideA>()) { // M major + return layout::ColumnMajor{}; + } + // Specialize for sparse layout + else if constexpr (cute::get<0>(InternalStrideA{}) == cute::_2{} && + cute::rank(cute::get<1>(InternalStrideA{})) == 2 && + cute::is_same_v(InternalStrideA{}))>>) { + return layout::ColumnMajor{}; + } + else { // K major + return layout::RowMajor{}; + } + + CUTE_GCC_UNREACHABLE; +} + +template +constexpr +auto +stride_to_layout_tag_B() { + using InternalStrideB = cute::remove_pointer_t; + if constexpr (cute::is_layout::value) { + return stride_to_layout_tag_B(); + } + 
else if constexpr (is_major<0, StrideB>()) { // N major + return layout::RowMajor{}; + } + else { // K major + return layout::ColumnMajor{}; + } + + CUTE_GCC_UNREACHABLE; +} + +template +constexpr +auto +stride_to_layout_tag_C() { + using InternalStrideC = cute::remove_pointer_t; + if constexpr (cute::is_layout::value) { + return stride_to_layout_tag_C(); + } + else if constexpr (is_major<0, StrideC>()) { // M major + return layout::ColumnMajor{}; + } + else { // N major + return layout::RowMajor{}; + } + + CUTE_GCC_UNREACHABLE; +} + +// Utilities to map Stride back on to their corresponding layout tags +template +struct StrideToLayoutTagA { + using type = decltype(detail::stride_to_layout_tag_A()); +}; + +template +struct StrideToLayoutTagB { + using type = decltype(detail::stride_to_layout_tag_B()); +}; + +template +struct StrideToLayoutTagC { + using type = decltype(detail::stride_to_layout_tag_C()); +}; + +// Convenience aliases +template +using StrideToLayoutTagA_t = typename StrideToLayoutTagA::type; + +template +using StrideToLayoutTagB_t = typename StrideToLayoutTagB::type; + +template +using StrideToLayoutTagC_t = typename StrideToLayoutTagC::type; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Inspects a tiled copy and whether its copy engine is TMA or not +template +constexpr bool is_tma_copy_engine() { + if constexpr (cute::is_void_v) { + return false; + } + else { + if constexpr ( cute::is_base_of_v + || cute::is_base_of_v + || cute::is_base_of_v + || cute::is_base_of_v + || cute::is_base_of_v + || cute::is_base_of_v + ) { + return true; + } + } + return false; +} + +template +struct RawDtype { using type = X; }; + +template +struct RawDtype> { using type = typename X::raw_type; }; + + +// Inspects a TiledCopy and returns its alignment in terms of element count +template +constexpr int +get_alignment_count_from_gmem_tiled_copy() { + + if constexpr (cute::is_void_v) { + return 1; + } + + // Account for ElementC = void kernels + else if constexpr (cute::is_void_v) { + return 0; + } + + else { + // For TMA tiled copies, we know the alignment has to be 128 bits + if constexpr (is_tma_copy_engine()) { + // For sparse MMA, alignment in logical elements is increased by sparsity factor + if constexpr (cute::is_sparse_v) { + return 128 / sizeof_bits::value * ElementMma::sparsity; + } + return 128 / sizeof_bits::value; + } + else { + // For non-TMA tiled copies, TiledCopy holds the alignment count directly in its TiledShape_MN + return GmemTiledCopy::NumValSrc; + } + } +} + +// Return alignment bit requirements for the GEMM inputs. +template < + class ElementType +> +constexpr int +get_input_alignment_bits() { + return 128; +} + +// Return alignment bit requirements for the GEMM outputs. 
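+// (Like the input requirement above, this currently resolves to a flat 128 bits.
+// As a quick illustration of how a 128-bit requirement translates into elements
+// for TMA copies:
+//   static_assert(128 / cutlass::sizeof_bits<cutlass::half_t>::value  ==  8, "");
+//   static_assert(128 / cutlass::sizeof_bits<cutlass::int4b_t>::value == 32, "");
+// i.e. 8 half_t or 32 int4b_t values per access.)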
+template +constexpr int +get_output_alignment_bits() { + return 128; +} + +// Check if tensor layout satisfies a given major alignment +template +CUTLASS_HOST_DEVICE constexpr +bool +check_alignment(cute::Layout const& layout) { + // Condition: shape must divide by Alignment without rounding + bool shape_check = cute::size(layout.shape()) == Alignment * cute::size(cute::upcast(layout)); + // Condition: every dynamic stride must be a multiple of Alignment + bool stride_check = cute::all_of(cute::flatten(layout.stride()), [](auto s){ return cute::is_static::value || (s % Alignment == 0); }); + return shape_check && stride_check; +} + +// Check if tensor layout satisfies a given major alignment +template +CUTLASS_HOST_DEVICE constexpr +bool +check_alignment(Shape const& shape, Stride const& stride) { + return check_alignment(cute::make_layout(shape, stride)); +} + +template +CUTLASS_HOST_DEVICE constexpr +size_t +alignment_for_swizzle(cute::Swizzle) { + static_assert(B >= 0 and M >= 0); + return size_t(1) << size_t(B + M + cute::abs(S)); +} + +template +CUTLASS_HOST_DEVICE constexpr +size_t +alignment_for_swizzle(Layout layout) { + return alignment_for_swizzle(cute::detail::get_swizzle_portion(layout)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::detail diff --git a/include/cutlass/detail/mainloop_fusion_helper_scale_factor.hpp b/include/cutlass/detail/mainloop_fusion_helper_scale_factor.hpp new file mode 100644 index 0000000000..914443dd0d --- /dev/null +++ b/include/cutlass/detail/mainloop_fusion_helper_scale_factor.hpp @@ -0,0 +1,75 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Mainloop Fusion configs specific for scale factors +*/ + +#pragma once + +#include // cute::void_t + +namespace cutlass::detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct ElementSFType { + using type = void; +}; + +template +struct ElementSFType> { + using type = typename CollectiveMainloop::ElementSF; +}; + +template +struct LayoutSFAType { + using type = void; +}; + +template +struct LayoutSFAType> { + using type = typename CollectiveMainloop::LayoutSFA; +}; + +template +struct LayoutSFBType { + using type = void; +}; + +template +struct LayoutSFBType> { + using type = typename CollectiveMainloop::LayoutSFB; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::detail diff --git a/include/cutlass/detail/mma.hpp b/include/cutlass/detail/mma.hpp new file mode 100644 index 0000000000..0e491b9c40 --- /dev/null +++ b/include/cutlass/detail/mma.hpp @@ -0,0 +1,71 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/arch/mma.h" +#include "cute/layout.hpp" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::detail { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct IsSparseTensorOp : cute::false_type { }; + +// TiledMma for sparse must have ValTypeE +template +struct IsSparseTensorOp> + : cute::true_type { }; + +// The following metafunction is used to extract the OperatorClass from a cutlass 3.x kernel. 
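// (Aside: the ElementSFType / LayoutSFAType / LayoutSFBType helpers in the scale-factor header
// above rely on the cute::void_t detection idiom. A self-contained sketch of the same pattern,
// using hypothetical illustration-only types that are not part of CUTLASS:)
struct ExampleMainloopWithSF    { using ElementSF = float; };   // hypothetical type
struct ExampleMainloopWithoutSF { };                            // hypothetical type

template <class T, class = void>
struct ExampleHasElementSF : cute::false_type { };              // primary: no ElementSF member

template <class T>
struct ExampleHasElementSF<T, cute::void_t<typename T::ElementSF>> : cute::true_type { };

static_assert( ExampleHasElementSF<ExampleMainloopWithSF>::value,    "detected via the void_t specialization");
static_assert(!ExampleHasElementSF<ExampleMainloopWithoutSF>::value, "falls back to the primary template");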
+template +struct get_operator_class { + static constexpr bool is_sparse_op = IsSparseTensorOp::value; + static constexpr bool is_tensor_op = cute::size<0>(typename TiledMma::AtomShape_MNK{}) >= 8; + using type = cute::conditional_t< + is_tensor_op, + cute::conditional_t< + is_sparse_op, + cutlass::arch::OpClassSparseTensorOp, + cutlass::arch::OpClassTensorOp + >, + cutlass::arch::OpClassSimt + >; +}; + +template +using get_operator_class_t = typename get_operator_class::type; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::detail diff --git a/include/cutlass/device_kernel.h b/include/cutlass/device_kernel.h index 1de33024b9..cc7caede49 100644 --- a/include/cutlass/device_kernel.h +++ b/include/cutlass/device_kernel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -34,20 +34,46 @@ #pragma once -#include "cutlass/cutlass.h" +#include // CUTLASS_HOST_DEVICE +#include // uint64_t + +// __grid_constant__ was introduced in CUDA 11.7. +#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 7))) && !CUTLASS_CLANG_CUDA +# define CUTLASS_GRID_CONSTANT_SUPPORTED +#endif + +// __grid_constant__ can be enabled only on SM70+ +#if defined(CUTLASS_GRID_CONSTANT_SUPPORTED) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) +# define CUTLASS_GRID_CONSTANT_ENABLED +#endif + +#if ! defined(CUTLASS_GRID_CONSTANT) +# if defined(CUTLASS_GRID_CONSTANT_ENABLED) +# define CUTLASS_GRID_CONSTANT __grid_constant__ +# else +# define CUTLASS_GRID_CONSTANT +# endif +#endif + //////////////////////////////////////////////////////////////////////////////// namespace cutlass { +template struct Type2Type { using type=T; }; +// using the simple type to replace the complex type to reduce this symbol size +template struct GetUnderlyingKernel : public Type2Type {}; +template class Wrapper > struct GetUnderlyingKernel> : public Wrapper {}; +template using GetUnderlyingKernel_t = typename GetUnderlyingKernel::type; + + //////////////////////////////////////////////////////////////////////////////// /// Generic CUTLASS kernel template. template -__global__ +CUTLASS_GLOBAL void Kernel(typename Operator::Params params) { // Dynamic shared memory base pointer extern __shared__ int SharedStorageBase[]; - // Declare pointer to dynamic shared memory. typename Operator::SharedStorage *shared_storage = reinterpret_cast(SharedStorageBase); @@ -55,8 +81,48 @@ void Kernel(typename Operator::Params params) { Operator op; op(params, *shared_storage); + cutlass::arch::synclog_print(); } + +/// Generic CUTLASS kernel template. +template +CUTLASS_GLOBAL +void Kernel2(typename Operator::Params params) { + // Dynamic shared memory base pointer + extern __shared__ int SharedStorageBase[]; + // Declare pointer to dynamic shared memory. 
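  // (SharedStorageBase is the kernel's dynamic shared memory allocation, sized by the host at
  //  launch time; the cast below simply reinterprets that raw buffer as the Operator's typed
  //  SharedStorage view.)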
+ typename Operator::SharedStorage *shared_storage = + reinterpret_cast(SharedStorageBase); + + Operator::invoke(params, *shared_storage); + cutlass::arch::synclog_print(); + +} + + +//////////////////////////////////////////////////////////////////////////////// +// +// 3.0 specific launch +// //////////////////////////////////////////////////////////////////////////////// -} /// namespace cutlass +/// Generic CUTLASS kernel template. +template +CUTLASS_GLOBAL +#ifdef __CUDACC__ +// Enclosing this in __CUDACC__ suppresses MSVC warnings. +__launch_bounds__(Operator::MaxThreadsPerBlock, Operator::MinBlocksPerMultiprocessor) +#endif // __CUDACC__ +void device_kernel(CUTLASS_GRID_CONSTANT typename Operator::Params const params) +{ + // Dynamic shared memory base pointer + extern __shared__ char smem[]; + Operator op; + op(params, smem); + cutlass::arch::synclog_print(); + +} + +//////////////////////////////////////////////////////////////////////////////// +} /// namespace cutlass diff --git a/include/cutlass/epilogue/collective/builders/sm90_builder.inl b/include/cutlass/epilogue/collective/builders/sm90_builder.inl new file mode 100644 index 0000000000..720dcc008a --- /dev/null +++ b/include/cutlass/epilogue/collective/builders/sm90_builder.inl @@ -0,0 +1,813 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cute/atom/mma_traits_sm90.hpp" +#include "cute/atom/mma_traits_sm90_gmma.hpp" +#include "cute/atom/copy_traits_sm90.hpp" + +#include "cutlass/detail/dependent_false.hpp" +#include "cutlass/detail/layout.hpp" +#include "cutlass/gemm/collective/builders/sm90_common.inl" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_epilogue.hpp" +#include "cutlass/epilogue/collective/builders/sm90_common.inl" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" +#include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp" + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +/////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective { + +/////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +// Returns the parameterized dispatch policy for the TMA epilogue +template +constexpr auto +sm90_get_tma_dispatch_policy() { + using namespace cute; + + constexpr int EpiTiles = size(shape_div(take<0,2>(TileShapeMNK{}), EpilogueTileMN{})); + constexpr int FragmentSize = size(EpilogueTileMN{}) / (detail::sm90_is_cooperative_v ? 256 : 128); + // 8b residuals load fast and consume little smem, so the perf cost of waiting on stores to finish outweighs the cost of extra allocation + constexpr bool ReuseSmem = (sizeof_bits_v == sizeof_bits_v) && (sizeof_bits_v > 8); + // TMA store delay performs worse with residual loads and compilicates tensormap updates for Ptr-Array GEMMs + constexpr bool DelayTmaStore = is_void_v && !detail::sm90_is_ptr_array_tma_v; + constexpr int StagesD = cute::min(EpiTiles, 2); + constexpr int StagesC = ReuseSmem ? cute::max(cute::min(EpiTiles, 4), StagesD+1) + : cute::min(EpiTiles, 4); + + if constexpr (detail::sm90_is_ptr_array_tma_v) { + return Sm90PtrArrayTmaWarpSpecialized{}; + } + else { + return Sm90TmaWarpSpecialized{}; + } +} + +// Returns the smem layout atom to be used for C or D matrix +template +constexpr auto +sm90_get_epilogue_smem_swizzle_layout_atom() { + using namespace cute; + + // ColMajor C/D (M-major) + if constexpr (cutlass::gemm::detail::is_major<0>(GmemStrideType{})) { + return cutlass::gemm::collective::detail::ss_smem_selector< + cute::GMMA::Major::MN, Element, decltype(get<0>(EpilogueTile_MN{})), decltype(get<1>(EpilogueTile_MN{})) + >(); + } + // RowMajor C/D (N-major) + else if constexpr (cutlass::gemm::detail::is_major<1>(GmemStrideType{})) { + return cutlass::gemm::collective::detail::ss_smem_selector< + cute::GMMA::Major::K , Element, decltype(get<0>(EpilogueTile_MN{})), decltype(get<1>(EpilogueTile_MN{})) + >(); + } + else { + static_assert(cutlass::detail::dependent_false, "Unsupported gmem layout."); + } +} + +// Attempts to compute a reasonable epilogue tile based on block tile shape or allows the user to provide one. 
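// Worked numbers connecting the dispatch-policy helper above with the tile heuristic below
// (a back-of-the-envelope sketch; the 128x128 CTA tile is an assumed example and plain ints
// stand in for the cute shape types):
constexpr int kExampleCtaM = 128, kExampleCtaN = 128;  // assumed CTA tile M,N
constexpr int kExampleEpiM = 128, kExampleEpiN = 32;   // subtile the cooperative heuristic below picks
constexpr int kExampleEpiTiles     = (kExampleCtaM / kExampleEpiM) * (kExampleCtaN / kExampleEpiN); // 4 subtiles per CTA tile
constexpr int kExampleFragmentSize = (kExampleEpiM * kExampleEpiN) / 256;          // 16 accumulators per thread on the 256-thread schedule
constexpr int kExampleStagesD      = kExampleEpiTiles < 2 ? kExampleEpiTiles : 2;  // min(EpiTiles, 2) -> 2
constexpr int kExampleStagesC      = kExampleEpiTiles < 4 ? kExampleEpiTiles : 4;  // min(EpiTiles, 4) -> 4 when smem is not reused
static_assert(kExampleEpiTiles == 4 && kExampleFragmentSize == 16 &&
              kExampleStagesD == 2 && kExampleStagesC == 4, "sanity check of the arithmetic above");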
+template +constexpr auto +sm90_compute_tile_shape_or_override() { + if constexpr (cute::is_same_v) { + auto epi_tile = [&] () { + if constexpr (detail::sm90_is_cooperative_v) { + auto tile_m = cute::min(_128{}, size<0>(TileShape_MNK{})); + auto tile_n = cute::min(_32{}, size<1>(TileShape_MNK{})); + return make_shape(tile_m, tile_n); + } + else if constexpr (detail::sm90_is_warp_specialized_v) { + constexpr int N_perf = sizeof_bits_v == 8 ? 64 : 32; + auto tile_m = cute::min(_64{}, size<0>(TileShape_MNK{})); + auto tile_n = cute::min(Int{}, size<1>(TileShape_MNK{})); + return make_shape(tile_m, tile_n); + } + else { + static_assert(cutlass::detail::dependent_false, "Unsupported schedule."); + } + }(); + + return cute::transform(epi_tile, seq<0,1>{}, + [] (auto epi_tiler, auto I) { + auto cta_tiler = make_layout(get(TileShape_MNK{})); + // This is a multimodal CTA tiler, transform before returning + if constexpr (depth(cta_tiler) > 0) { + // This is an implicit multimodal tiler, match profile and return + if constexpr (tuple_size_v == 1) { + return make_tile(epi_tiler); + } + // This is an explicit multimodal tiler, compose out epi tiler + else { + return composition(cta_tiler, epi_tiler); + } + } + // This is a flat CTA tiler, no need for transformation + else { + return epi_tiler; + } + }); + } + else if constexpr (cute::is_tuple::value) { + EpilogueTileType epi_tile; + constexpr int M = size<0>(shape(epi_tile)); + constexpr int N = size<1>(shape(epi_tile)); + + static_assert(!is_layout::value, "EpilogueTile must be a cute::Tile or cute::Shape"); + static_assert(M == 64 && detail::sm90_is_warp_specialized_v || + M == 128 && detail::sm90_is_cooperative_v, "Unsupported tile shape"); + static_assert(N % 16 == 0, "Unsupported tile shape"); + + return epi_tile; + } + else { + static_assert(cutlass::detail::dependent_false, "Invalid type for EpilogueTileType."); + } +} + +// callbacks builder with TMA aux out +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class FusionOp, + class TileShape_MNK, + class EpilogueTile_MN, + class ElementAccumulator +> +struct CallbacksBuilder< + Sm90TmaWarpSpecialized, + FusionOp, + TileShape_MNK, + EpilogueTile_MN, + ElementAccumulator, + cute::enable_if_t<(FusionOp::IsAuxOutSupported ^ FusionOp::IsAuxInSupported) // only one aux tensor + && not cute::is_subbyte_v> +> { + using GmemStrideTypeAux = gemm::TagToStrideC_t; + using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom< + GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); + using CopyOpR2S = decltype(detail::sm90_get_smem_store_op_for_accumulator< + GmemStrideTypeAux, typename FusionOp::ElementAux>()); + using CopyOpS2R = decltype(detail::sm90_get_smem_load_op_for_source< + GmemStrideTypeAux, typename FusionOp::ElementAux>()); + using SmemCopyOpAux = cute::conditional_t; + + using Callbacks = fusion::FusionCallbacks< + Sm90TmaWarpSpecialized, + FusionOp, TileShape_MNK, EpilogueTile_MN, + SmemLayoutAtomAux, SmemCopyOpAux + >; +}; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class FusionOp, + class TileShape_MNK, + class EpilogueTile_MN, + class ElementAccumulator +> +struct CallbacksBuilder< + Sm90TmaWarpSpecialized, + FusionOp, + TileShape_MNK, + EpilogueTile_MN, + ElementAccumulator, + cute::enable_if_t<(FusionOp::IsAuxOutSupported ^ FusionOp::IsAuxInSupported) // only one aux tensor + && sizeof_bits_v == 1> +> { + using Callbacks = 
fusion::FusionCallbacks< + Sm90TmaWarpSpecialized, + FusionOp, TileShape_MNK, EpilogueTile_MN, + Layout<_1,_0>, DefaultCopy // aux bit tensor doesn't use smem + >; +}; + +// Helper for building TMA warp-specialized collective epilogues, specialized by +// the fusion operation performed and the dispatch policy to use. +template < + class TileShape_MNK, + class EpilogueTile_MN, + class ElementAccumulator, + class ElementCompute, + class ElementC_, + class GmemLayoutTagC_, + int AlignmentC, + class ElementD_, + class GmemLayoutTagD, + int AlignmentD, + class FusionOpOrCallbacks, + class DispatchPolicy +> +struct Sm90TmaBuilderImpl { + // Passing void D disables destination store + smem allocation + using ElementD = cute::conditional_t, + fusion::get_element_aux_t, ElementD_>; + + // Passing void C disables source load + smem allocation + using ElementC = cute::conditional_t,ElementD,ElementC_>; // prevents void ref breakages + using GmemLayoutTagC = cute::conditional_t,GmemLayoutTagD,GmemLayoutTagC_>; + + using GmemStrideTypeC = cutlass::detail::TagToStrideC_t; + using GmemStrideTypeD = cutlass::detail::TagToStrideC_t; + + using UnderlyingGmemStrideTypeC = cute::remove_pointer_t; + using UnderlyingGmemStrideTypeD = cute::remove_pointer_t; + + using CopyOpS2G = cute::conditional_t, + SM90_TMA_STORE_IM2COL, + SM90_TMA_STORE + >; + using CopyOpG2S = cute::conditional_t, + SM90_TMA_LOAD_IM2COL, + SM90_TMA_LOAD + >; + + // Get the smallest tiled copy we can use to retile the accumulators + using CopyAtomC = Copy_Atom; + // Get register to register tiled copy that happen before shared memory store. + // Apply void as no register transform op needed currently. + using CopyOpR2R = void; + + // TMA builder allows for passing callbacks directly, which is either a fusion::FusionCallbacks + // instance or a direct visitor implementation, e.g. fusion::Sm90LinearCombination + using FusionCallbacks = + typename CallbacksBuilder< + DispatchPolicy, + FusionOpOrCallbacks, + TileShape_MNK, + EpilogueTile_MN, + ElementAccumulator + >::Callbacks; + + using CollectiveOp = cutlass::epilogue::collective::CollectiveEpilogue< + DispatchPolicy, + TileShape_MNK, + EpilogueTile_MN, + ElementC_, // Need to pass void through to expose via GemmUniversal + GmemStrideTypeC, + ElementD_, + GmemStrideTypeD, + FusionCallbacks, + CopyOpG2S, + decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), + decltype(detail::sm90_get_smem_load_op_for_source()), + CopyOpS2G, + decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), + decltype(detail::sm90_get_smem_store_op_for_accumulator()), + CopyAtomC, + CopyOpR2R + >; +}; + +/////////////////////////////////////////////////////////////////////////////// +// Descriptor classes for defining EVT nodes +// Some of the epilogue visitor nodes require non-intuitive template arguments +// such as CopyOpS2R for AuxLoad node. Traditionaly, these are resolved by the +// builder classes. 
Here we provide a set of descriptor classes that resolve +// these template arguments from more intuitive types such as Stride, Layout + +// Get TileShape, EpilogueTile, Dispatch Policy, StagesC, and STagesD +template< + typename TileShape_MNK, + typename EpilogueTileType, + typename ElementC, + typename ElementD, + typename Schedule +> +struct EpilogueDescriptor { + using TileShape = TileShape_MNK; + using EpilogueTile = + decltype( + detail::sm90_compute_tile_shape_or_override< + ElementD, EpilogueTileType, Schedule, TileShape_MNK + >() + ); + using DispatchPolicy = + decltype( + detail::sm90_get_tma_dispatch_policy< + TileShape_MNK, EpilogueTile, + ElementC, ElementD, Schedule + >() + ); + constexpr static int StagesC = DispatchPolicy::StagesC; + constexpr static int StagesD = DispatchPolicy::StagesD; +}; + +// Get Stride, SmemLayout, and CopyOpS2R for AuxLoad node +template< + typename EpilogueDescriptor, + typename StrideOrLayoutTag, + typename ElementAux +> +struct AuxLoadDescriptor { + constexpr static int Stages = EpilogueDescriptor::StagesC; + using EpilogueTile = typename EpilogueDescriptor::EpilogueTile; + using Element = ElementAux; + using Stride = cutlass::detail::TagToStrideC_t; + using SmemLayoutAtom = + decltype( + detail::sm90_get_epilogue_smem_swizzle_layout_atom< + Stride, ElementAux, typename EpilogueDescriptor::EpilogueTile + >() + ); + using CopyOpS2R = + decltype(detail::sm90_get_smem_load_op_for_source()); +}; + +// Get Stride, SmemLayout, and CopyOpS2R for AuxStore node +template< + typename EpilogueDescriptor, + typename StrideOrLayoutTag, + typename ElementAux +> +struct AuxStoreDescriptor { + constexpr static int Stages = EpilogueDescriptor::StagesD; + using EpilogueTile = typename EpilogueDescriptor::EpilogueTile; + using Element = ElementAux; + using Stride = cutlass::detail::TagToStrideC_t; + using SmemLayoutAtom = + decltype( + detail::sm90_get_epilogue_smem_swizzle_layout_atom< + Stride, ElementAux, typename EpilogueDescriptor::EpilogueTile + >() + ); + using CopyOpR2S = + decltype(detail::sm90_get_smem_store_op_for_accumulator()); +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////// + +// No-smem builder +template < + class OpClass, + class TileShape_MNK, + class ClusterShape_MNK, + class EpilogueTileType, + class ElementAccumulator, + class ElementCompute, + class ElementC_, + class GmemLayoutTagC_, + int AlignmentC, + class ElementD, + class GmemLayoutTagD, + int AlignmentD, + class Schedule, + FloatRoundStyle RoundStyle +> +struct CollectiveBuilder< + arch::Sm90, + OpClass, + TileShape_MNK, + ClusterShape_MNK, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC_, + GmemLayoutTagC_, + AlignmentC, + ElementD, + GmemLayoutTagD, + AlignmentD, + Schedule, + fusion::LinearCombination, + cute::enable_if_t || + cute::is_same_v || + cute::is_same_v >> { + + // Passing void C disables source load + using ElementC = cute::conditional_t, + ElementD, ElementC_>; // prevents cute breakages + using GmemLayoutTagC = cute::conditional_t, + GmemLayoutTagD, GmemLayoutTagC_>; + static constexpr thread::ScaleType::Kind ScaleType = cute::is_void_v ? 
+ thread::ScaleType::OnlyAlphaScaling : thread::ScaleType::Default; + + static constexpr int FragmentSize = 1; + using ThreadOp = thread::LinearCombination< + ElementD, FragmentSize, ElementAccumulator, ElementCompute, + ScaleType, RoundStyle, ElementC>; + + using CollectiveOp = cute::conditional_t< + cute::is_same_v, + cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter< + cutlass::epilogue::collective::DefaultEpilogue< + cutlass::detail::TagToStrideC_t, + cutlass::detail::TagToStrideC_t, + ThreadOp, + cutlass::gemm::EpilogueDefault>>, + // Epilogue for Ptr-Array and Grouped Gemm + cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter< + cutlass::epilogue::collective::DefaultEpilogueArray< + cutlass::detail::TagToStrideC_t, + cutlass::detail::TagToStrideC_t, + ThreadOp, + Schedule>> + >; +}; + +// Tma warp-specialized builder +template < + class OpClass, + class TileShape_MNK, + class ClusterShape_MNK, + class EpilogueTileType, + class ElementAccumulator, + class ElementCompute, + class ElementC, + class GmemLayoutTagC, + int AlignmentC, + class ElementD_, + class GmemLayoutTagD, + int AlignmentD, + class Schedule, + class FusionOperation +> +struct CollectiveBuilder< + arch::Sm90, + OpClass, + TileShape_MNK, + ClusterShape_MNK, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC, + GmemLayoutTagC, + AlignmentC, + ElementD_, + GmemLayoutTagD, + AlignmentD, + Schedule, + FusionOperation, + cute::enable_if_t || + cute::is_same_v || + detail::sm90_is_ptr_array_tma_v>> { +private: + using ElementD = cute::conditional_t, + fusion::get_element_aux_t, ElementD_>; + using EpilogueTile_MN = + decltype(detail::sm90_compute_tile_shape_or_override()); + using DispatchPolicy = + decltype(detail::sm90_get_tma_dispatch_policy()); + +public: + using CollectiveOp = + typename detail::Sm90TmaBuilderImpl< + TileShape_MNK, + EpilogueTile_MN, + ElementAccumulator, + ElementCompute, + ElementC, + GmemLayoutTagC, + AlignmentC, + ElementD_, + GmemLayoutTagD, + AlignmentD, + FusionOperation, + DispatchPolicy + >::CollectiveOp; +}; + +// Auto builder +template < + class OpClass, + class TileShape_MNK, + class ClusterShape_MNK, + class EpilogueTileType, + class ElementAccumulator, + class ElementCompute, + class ElementC, + class GmemLayoutTagC, + int AlignmentC, + class ElementD, + class GmemLayoutTagD, + int AlignmentD, + class FusionOperation +> +struct CollectiveBuilder< + arch::Sm90, + OpClass, + TileShape_MNK, + ClusterShape_MNK, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC, + GmemLayoutTagC, + AlignmentC, + ElementD, + GmemLayoutTagD, + AlignmentD, + EpilogueScheduleAuto, + FusionOperation, + void> { +private: + static_assert(cute::is_same_v>, + "Auto schedule doesn't support fusion. 
Use one of the TmaWarpSpecialized schedules instead."); + + // Pick No-Smem epilogue as the Auto Epilogue Schedule (Auto schedules do not guarantee best performance) + // since TMA epilogues are not compatible with non-TMA non-WS mainloops + using EpilogueSchedule = NoSmemWarpSpecialized; + using _CollectiveBuilder = CollectiveBuilder< + arch::Sm90, + OpClass, + TileShape_MNK, + ClusterShape_MNK, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC, + GmemLayoutTagC, + AlignmentC, + ElementD, + GmemLayoutTagD, + AlignmentD, + EpilogueSchedule, + FusionOperation + >; + +public: + using CollectiveOp = typename _CollectiveBuilder::CollectiveOp; +}; + +// DEPRECATED Tma warp-specialized builder for elementwise fusion +template < + class OpClass, + class TileShape_MNK, + class ClusterShape_MNK, + class EpilogueTileType, + class ElementAccumulator, + class ElementCompute, + class ElementC, + class GmemLayoutTagC, + int AlignmentC, + class ElementD, + class GmemLayoutTagD, + int AlignmentD, + class Schedule, + class UnusedFusionOp +> +struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombEltAct instead")]] +CollectiveBuilder< + arch::Sm90, + OpClass, + TileShape_MNK, + ClusterShape_MNK, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC, + GmemLayoutTagC, + AlignmentC, + ElementD, + GmemLayoutTagD, + AlignmentD, + Schedule, + UnusedFusionOp, + cute::enable_if_t || + cute::is_base_of_v >> { +private: + using FusionOp = + fusion::LinCombEltAct; + using ImplSchedule = + cute::conditional_t, + TmaWarpSpecialized, TmaWarpSpecializedCooperative>; + +public: + using CollectiveOp = + typename CollectiveBuilder< + arch::Sm90, + OpClass, + TileShape_MNK, + ClusterShape_MNK, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC, + GmemLayoutTagC, + AlignmentC, + ElementD, + GmemLayoutTagD, + AlignmentD, + ImplSchedule, + FusionOp + >::CollectiveOp; +}; + +// DEPRECATED Tma warp-specialized builder for bias + elementwise fusion +template < + class OpClass, + class TileShape_MNK, + class ClusterShape_MNK, + class EpilogueTileType, + class ElementAccumulator, + class ElementCompute, + class ElementC_, + class GmemLayoutTagC_, + int AlignmentC, + class ElementD, + class GmemLayoutTagD, + int AlignmentD, + class Schedule, + class UnusedFusionOp +> +struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombPerRowBiasEltAct or fusion::LinCombPerRowBiasEltActAux instead")]] +CollectiveBuilder< + arch::Sm90, + OpClass, + TileShape_MNK, + ClusterShape_MNK, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC_, + GmemLayoutTagC_, + AlignmentC, + ElementD, + GmemLayoutTagD, + AlignmentD, + Schedule, + UnusedFusionOp, + cute::enable_if_t || + cute::is_base_of_v >> { +private: + using EpilogueTile_MN = decltype(detail::sm90_compute_tile_shape_or_override< + ElementD, EpilogueTileType, Schedule, TileShape_MNK>()); + // MSVC doesn't seem to be able to deduce DispatchPolicy correctly if it's + // defined as decltype of a detail::sm90_get_tma_dispatch_policy call. + // Instead, we paste in the contents of that function. A natural refactoring + // would be to create a type alias in the detail namespace. + using DispatchPolicy = Sm90TmaWarpSpecialized< + /* StagesC = */ size(shape_div(take<0, 2>(TileShape_MNK{}), EpilogueTile_MN{})), + /* StagesD = */ 2, + /* FragmentSize = */ size(EpilogueTile_MN{}) / (detail::sm90_is_cooperative_v ? 
256 : 128), + /* ReuseSmemC = */ sizeof_bits_v == sizeof_bits_v, + false + >; + + using GmemStrideTypeAux = gemm::TagToStrideC_t; + using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom< + GmemStrideTypeAux, typename Schedule::ElementT, EpilogueTile_MN>()); + using SmemCopyOpAux = decltype(detail::sm90_get_smem_store_op_for_accumulator< + GmemStrideTypeAux, typename Schedule::ElementT>()); + using FusionOperationAux = fusion::LinCombPerRowBiasEltActAux< + GmemLayoutTagD, Schedule::template ActivationFunctor, ElementD, ElementCompute, + typename Schedule::ElementT, typename Schedule::ElementBias, ElementC_, ElementCompute + >; + using FusionCallbacksAux = fusion::FusionCallbacks< + DispatchPolicy, FusionOperationAux, TileShape_MNK, EpilogueTile_MN, SmemLayoutAtomAux, SmemCopyOpAux + >; + + using FusionOperationNoAux = fusion::LinCombPerRowBiasEltAct< + Schedule::template ActivationFunctor, ElementD, ElementCompute, + typename Schedule::ElementBias, ElementC_, ElementCompute + >; + using FusionCallbacksNoAux = fusion::FusionCallbacks< + DispatchPolicy, FusionOperationNoAux, TileShape_MNK, EpilogueTile_MN + >; + + using ElementC = cute::conditional_t,ElementD,ElementC_>; // prevents void ref breakages + using GmemLayoutTagC = cute::conditional_t,GmemLayoutTagD,GmemLayoutTagC_>; + + using GmemStrideTypeC = gemm::TagToStrideC_t; + using GmemStrideTypeD = gemm::TagToStrideC_t; + + // Get the smallest tiled copy we can use to retile the accumulators + using CopyAtomC = Copy_Atom; + // Get register to register tiled copy that happen before shared memory store. + // Apply void as no register transform op needed. + using CopyOpR2R = void; + +public: + using CollectiveOp = cutlass::epilogue::collective::Sm90EpilogueTmaWarpSpecializedBiasElementwise< + DispatchPolicy::StagesC, + DispatchPolicy::StagesD, + DispatchPolicy::FragmentSize, + TileShape_MNK, + EpilogueTile_MN, + ElementC_, // Need to pass void through to expose via GemmUniversal + GmemStrideTypeC, + ElementD, + GmemStrideTypeD, + cute::conditional_t, + SM90_TMA_LOAD, + decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), + decltype(detail::sm90_get_smem_load_op_for_source()), + SM90_TMA_STORE, + decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), + decltype(detail::sm90_get_smem_store_op_for_accumulator()), + CopyAtomC, + CopyOpR2R + >; +}; + +// CollectiveBuilder that transposed epilogue below is used for sm90 gmma RS TT kernels +// since swapping NNN kernels input matrix and transposing its output at the same time then +// we can get TTN kernel. +template < + class OpClass, + class TileShape_MNK, + class ClusterShape_MNK, + class EpilogueTileType, + class ElementAccumulator, + class ElementCompute, + class ElementC_, + class GmemLayoutTagC_, + int AlignmentC, + class ElementD, + class GmemLayoutTagD, + int AlignmentD, + FloatRoundStyle RoundStyle +> +struct CollectiveBuilder< + arch::Sm90, + OpClass, + TileShape_MNK, + ClusterShape_MNK, + EpilogueTileType, + ElementAccumulator, + ElementCompute, + ElementC_, + GmemLayoutTagC_, + AlignmentC, + ElementD, + GmemLayoutTagD, + AlignmentD, + cutlass::gemm::EpilogueTransposed, + fusion::LinearCombination, + void> { + // Passing void C disables source load + using ElementC = cute::conditional_t, + ElementD, ElementC_>; // prevents cute breakages + using GmemLayoutTagC = cute::conditional_t, + GmemLayoutTagD, GmemLayoutTagC_>; + static constexpr thread::ScaleType::Kind ScaleType = cute::is_void_v ? 
+ thread::ScaleType::OnlyAlphaScaling : thread::ScaleType::Default; + + static constexpr int FragmentSize = 1; + using ThreadOp = thread::LinearCombination< + ElementD, FragmentSize, ElementAccumulator, ElementCompute, + ScaleType, RoundStyle, ElementC>; + + using CollectiveOp = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter< + cutlass::epilogue::collective::DefaultEpilogue< + cutlass::detail::TagToStrideC_t, + cutlass::detail::TagToStrideC_t, + ThreadOp, + cutlass::gemm::EpilogueTransposed> + >; +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::collective diff --git a/include/cutlass/epilogue/collective/builders/sm90_common.inl b/include/cutlass/epilogue/collective/builders/sm90_common.inl new file mode 100644 index 0000000000..cd2639c5dd --- /dev/null +++ b/include/cutlass/epilogue/collective/builders/sm90_common.inl @@ -0,0 +1,80 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +/////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective::detail { + +/////////////////////////////////////////////////////////////////////////////// + +// Selects the largest vectorized smem store atom available +template +constexpr auto +sm90_get_smem_store_op_for_accumulator() { + using namespace cute; + + if constexpr (sizeof(ElementD) == 2 && size<0>(GmemStrideTypeD{}) == 1) { + return SM90_U16x8_STSM_T{}; + } + else if constexpr (sizeof(ElementD) == 2 && size<1>(GmemStrideTypeD{}) == 1) { + return SM90_U32x4_STSM_N{}; + } + else { + // auto-vectorizing store + return AutoVectorizingCopyWithAssumedAlignment{}; + } +} + +// Selects the largest vectorized smem load atom available +template +constexpr auto +sm90_get_smem_load_op_for_source() { + using namespace cute; + + // Reuse the logic from smem store selector + using SmemStoreOp = decltype(sm90_get_smem_store_op_for_accumulator()); + + if constexpr (cute::is_same_v) { + return SM75_U16x8_LDSM_T{}; + } + else if constexpr (cute::is_same_v) { + return SM75_U32x4_LDSM_N{}; + } + else { + // auto-vectorizing load + return AutoVectorizingCopyWithAssumedAlignment<128>{}; + } +} + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::collective::detail diff --git a/include/cutlass/epilogue/collective/collective_builder.hpp b/include/cutlass/epilogue/collective/collective_builder.hpp new file mode 100644 index 0000000000..d54cd0a8f7 --- /dev/null +++ b/include/cutlass/epilogue/collective/collective_builder.hpp @@ -0,0 +1,120 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include // cute::DefaultCopy +#include // cute::is_base_of_v + +#include "cutlass/detail/dependent_false.hpp" +#include "cutlass/epilogue/fusion/callbacks.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Used to specify epilogue subtile shape or dispatch to automatic computation of subtile shape +struct EpilogueTileAuto {}; + +// Used to let the builder pick the epilogue schedule automatically. +// Can be overridden with kernel schedule tags in cutlass/gemm/dispatch_policy.hpp +struct EpilogueScheduleAuto {}; +struct EpilogueIm2ColScheduleAuto {}; + +template < + class ArchTag, + class OpClass, + class TileShape_MNK, + class ClusterShape_MNK, + class EpilogueTileType, + class ElementAccumulator, + class ElementCompute, + class ElementC, + class GmemLayoutTagC, + int AlignmentC, + class ElementD, + class GmemLayoutTagD, + int AlignmentD, + class EpilogueScheduleType, + class FusionOpOrCallbacks = cutlass::epilogue::fusion::LinearCombination, + class Enable = void +> +struct CollectiveBuilder { + static_assert(cutlass::detail::dependent_false, + "Could not build a collective epilogue for given parameters."); +}; + +// helper sub-builder for epilogue fusion callbacks (for internal use by CollectiveBuilder only) +namespace detail { + +// callbacks builder with operation tag +template< + class DispatchPolicy, + class FusionOp, + class TileShape_MNK, + class EpilogueTile_MN, + class ElementAccumulator, + class = void +> +struct CallbacksBuilder { + using Callbacks = fusion::FusionCallbacks; +}; + +// callbacks builder with callbacks passthrough +template < + class DispatchPolicy, + class FusionCallbacks, + class TileShape_MNK, + class EpilogueTile_MN, + class ElementAccumulator +> +struct CallbacksBuilder< + DispatchPolicy, + FusionCallbacks, + TileShape_MNK, + EpilogueTile_MN, + ElementAccumulator, + cute::enable_if_t> +> { + using Callbacks = FusionCallbacks; +}; + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "builders/sm90_builder.inl" +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/collective_epilogue.hpp b/include/cutlass/epilogue/collective/collective_epilogue.hpp new file mode 100644 index 0000000000..8fb1a9588b --- /dev/null +++ b/include/cutlass/epilogue/collective/collective_epilogue.hpp @@ -0,0 +1,71 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class DispatchPolicy, + class... Args +> +class CollectiveEpilogue { + static_assert(cutlass::detail::dependent_false, "Could not find an epilogue specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "detail.hpp" + +// +// Gemm +// +#include "default_epilogue.hpp" +#include "default_epilogue_array.hpp" +#include "epilogue_tensor_broadcast.hpp" +#include "sm70_epilogue_vectorized.hpp" +#include "sm70_epilogue_vectorized_array.hpp" +#include "sm90_epilogue_tma_warpspecialized.hpp" +#include "sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp" +#include "sm90_epilogue_array_tma_warpspecialized.hpp" +// +// Conv +// +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/default_epilogue.hpp b/include/cutlass/epilogue/collective/default_epilogue.hpp new file mode 100644 index 0000000000..cd4a6ccddb --- /dev/null +++ b/include/cutlass/epilogue/collective/default_epilogue.hpp @@ -0,0 +1,242 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" + +#include "cute/tensor.hpp" +#include "cute/numeric/numeric_types.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies an element wise operation to all elements within the fragment +/// and writes them out to destination storage. 
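// In user code, the builders introduced earlier in this change are normally reached through the
// CollectiveBuilder interface declared in collective_builder.hpp above. A minimal, illustrative
// sketch (tile sizes, element types and alignments are assumptions, not values taken from the
// library); the DefaultEpilogue defined next is one of the collectives the no-smem path can wrap:
using ExampleEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
    cute::Shape<cute::_128, cute::_128, cute::_64>,     // CTA tile (M, N, K)
    cute::Shape<cute::_1, cute::_1, cute::_1>,          // cluster shape
    cutlass::epilogue::collective::EpilogueTileAuto,    // let the builder pick the epilogue subtile
    float, float,                                       // ElementAccumulator, ElementCompute
    cutlass::half_t, cutlass::layout::RowMajor, 8,      // C: element, layout tag, alignment
    cutlass::half_t, cutlass::layout::RowMajor, 8,      // D: element, layout tag, alignment
    cutlass::epilogue::collective::EpilogueScheduleAuto // fusion defaults to LinearCombination
  >::CollectiveOp;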
+template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class EpilogueSchedule_ +> +class DefaultEpilogue { +public: + // + // Type Aliases + // + using EpilogueSchedule = EpilogueSchedule_; + using DispatchPolicy = EpilogueSchedule_; + + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + + using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + + static const int kOutputAlignment = ThreadEpilogueOp::kCount; + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage { }; + + using TensorStorage = SharedStorage; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread{}; + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + using Params = Arguments; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& _, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + // Note: SharedStorage is unused for DefaultEpilogue + CUTLASS_HOST_DEVICE + DefaultEpilogue(Params const& params_, SharedStorage const& shared_storage = SharedStorage()) + : params(params_), epilogue_op(params_.thread) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + return epilogue_op.is_source_needed(); + } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_HOST_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + [[maybe_unused]] char* smem_buf) + { + using namespace cute; + using X = Underscore; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = 
get<3>(problem_shape_mnkl); + + auto stride_c = detail::get_epilogue_stride(params.dC); + auto stride_d = detail::get_epilogue_stride(params.dD); + + // Represent the full output tensor + Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), stride_c); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d); // (m,n,l) + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + + // Partition source and destination tiles to match the accumulator partitioning + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCgD = thr_mma.partition_C(gD); // (VEC,THR_M,THR_N) + Tensor tCgC = thr_mma.partition_C(gC); // (VEC,THR_M,THR_N) + + static_assert(is_static::value, "Accumulator layout must be static"); + CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators), + "Accumulator count must have the same destination element count."); + + // Make an identity coordinate tensor for predicating our output MN tile + auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD)))); + Tensor tCcD = thr_mma.partition_C(cD); + + // source is needed + if (epilogue_op.is_source_needed()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + tCgD(i) = epilogue_op(accumulators(i), tCgC(i)); + } + } + } + // source is not needed, avoid load + else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + tCgD(i) = epilogue_op(accumulators(i)); + } + } + } + } + +private: + Params params; + ThreadEpilogueOp epilogue_op; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/default_epilogue_array.hpp b/include/cutlass/epilogue/collective/default_epilogue_array.hpp new file mode 100644 index 0000000000..da7562b43a --- /dev/null +++ b/include/cutlass/epilogue/collective/default_epilogue_array.hpp @@ -0,0 +1,283 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" + +#include "cute/tensor.hpp" +#include "cute/numeric/numeric_types.hpp" +#include "cutlass/trace.h" + +#include "cutlass/cuda_host_adapter.hpp" +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Applies an element wise operation to all elements within the fragment +// and writes them out to destination storage. 
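// Usage note (a minimal, illustrative sketch -- the buffer names below are
// placeholders chosen for this example, not symbols defined in this header):
// the pointer-array epilogue below takes one C pointer and one D pointer per
// batch or group, and for grouped GEMM StrideC/StrideD are themselves pointer
// types, so strides are also supplied per group. A host would assemble
// device-visible arrays and pass them through Arguments roughly as follows:
//
//   ElementC const** d_ptr_C;   // device array: d_ptr_C[g] -> C of group g
//   ElementD**       d_ptr_D;   // device array: d_ptr_D[g] -> D of group g
//   StrideC          d_dC;      // per-group strides (grouped GEMM) or a single stride
//   StrideD          d_dD;
//
//   typename CollectiveEpilogue::Arguments epilogue_args{
//     {alpha, beta},            // thread-level epilogue parameters
//     d_ptr_C, d_dC,
//     d_ptr_D, d_dD
//   };
//
// The {alpha, beta} initializer is schematic; the exact Params construction
// depends on the chosen ThreadEpilogueOp.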
+template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class EpilogueSchedule_ +> +class DefaultEpilogueArray { +public: + // + // Type Aliases + // + using EpilogueSchedule = EpilogueSchedule_; + using DispatchPolicy = EpilogueSchedule_; + + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using InternalStrideC = cute::remove_pointer_t; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + using InternalStrideD = cute::remove_pointer_t; + + using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + + static const int kOutputAlignment = ThreadEpilogueOp::kCount; + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static_assert(cute::is_same_v || cute::is_same_v || cute::is_same_v, "Incompatible epilogue schedule."); + static_assert(rank(InternalStrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(rank(InternalStrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage { }; + + using TensorMapStorage = SharedStorage; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread{}; + ElementC const** ptr_C = nullptr; + StrideC dC{}; + ElementD** ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + using Params = Arguments; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + ProblemShape const&, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + DefaultEpilogueArray(Params const& params_) + : params(params_) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + // For Ptr-Array or Grouped Gemm we cannot determine if source is needed based on first beta. 
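    // (With pointer arrays, beta can differ between batches/groups, so the real
    //  decision is deferred to the per-group ThreadEpilogueOp that operator()
    //  constructs with the group index; here we conservatively report true.)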
+ return true; + } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_HOST_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + [[maybe_unused]] char* smem_buf) + { + using namespace cute; + using X = Underscore; + + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + // Batches are managed by using appropriate pointers to C and D matrices + const int32_t mock_L = 1; + const int32_t mock_l_coord = 0; + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + + // If scalar alpha/beta are provided, i.e., same alpha/beta applies to all batches/groups. + // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups, + // we get the correct alpha/beta values for the current batch/group using group index. + ThreadEpilogueOp epilogue_op = ThreadEpilogueOp(params.thread, l_coord); + + if (epilogue_op.is_source_needed() && params.dC == nullptr) { + // Beta value is non-zero while pointer to C is a nullptr + assert(0); + } + + auto [stride_c, stride_d] = [&, l = l_coord]() { + if constexpr (!cute::is_same_v) { + // If grouped gemm + if (epilogue_op.is_source_needed()) { + return make_tuple( + detail::get_epilogue_stride(params.dC[l]), + detail::get_epilogue_stride(params.dD[l]) + ); + } + else { + return make_tuple( + InternalStrideC{}, + detail::get_epilogue_stride(params.dD[l]) + ); + } + } + else { + return make_tuple( + detail::get_epilogue_stride(params.dC), + detail::get_epilogue_stride(params.dD) + ); + } + }(); + + // Represent the full output tensor + ElementC const* ptr_C_l = nullptr; + if (epilogue_op.is_source_needed()) { + ptr_C_l = params.ptr_C[l_coord]; + } + Tensor mC_mnl = make_tensor(make_gmem_ptr(ptr_C_l), make_shape(M,N,mock_L), stride_c); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), make_shape(M,N,mock_L), stride_d); // (m,n,l) + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + Tensor gC = gC_mnl(_,_,m_coord,n_coord, mock_l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord, mock_l_coord); // (BLK_M,BLK_N) + + // Partition source and destination tiles to match the accumulator partitioning + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCgD = thr_mma.partition_C(gD); // (VEC,THR_M,THR_N) + Tensor tCgC = thr_mma.partition_C(gC); // (VEC,THR_M,THR_N) + + static_assert(is_static::value, "Accumulator layout must be static"); + CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators), + "Accumulator count must have the 
same destination element count."); + + // Make an identity coordinate tensor for predicating our output MN tile + auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD)))); + Tensor tCcD = thr_mma.partition_C(cD); + + // source is needed + if (epilogue_op.is_source_needed()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + tCgD(i) = epilogue_op(accumulators(i), tCgC(i)); + } + } + } + // source is not needed, avoid load + else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + tCgD(i) = epilogue_op(accumulators(i)); + } + } + } + } + +private: + Params params; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/detail.hpp b/include/cutlass/epilogue/collective/detail.hpp new file mode 100644 index 0000000000..23e57d99b8 --- /dev/null +++ b/include/cutlass/epilogue/collective/detail.hpp @@ -0,0 +1,502 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/pipeline/pipeline.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/epilogue/dispatch_policy.hpp" + +#include "cute/tensor.hpp" +#include "cute/numeric/numeric_types.hpp" +#include "cute/util/type_traits.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +namespace detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +constexpr bool +is_m_major() { + return cutlass::gemm::detail::is_major<0,Stride>(); +} + +template +constexpr bool +is_n_major() { + return cutlass::gemm::detail::is_major<1,Stride>(); +} + +template +constexpr bool +is_im2col() { + return cute::is_same_v> + || cute::is_same_v> + || cute::is_same_v>; +} + +template +struct sm90_is_ptr_array_tma : cute::false_type {}; + +template<> +struct sm90_is_ptr_array_tma : cute::true_type {}; + +template<> +struct sm90_is_ptr_array_tma : cute::true_type {}; + +template<> +struct sm90_is_ptr_array_tma : cute::true_type {}; + +template +static constexpr bool sm90_is_ptr_array_tma_v = sm90_is_ptr_array_tma::value; + +template +struct sm90_is_ptr_array_tma_cooperative : cute::false_type {}; + +template<> +struct sm90_is_ptr_array_tma_cooperative : cute::true_type {}; + +template +static constexpr bool sm90_is_ptr_array_tma_cooperative_v = sm90_is_ptr_array_tma_cooperative::value; + +template +struct sm90_is_ptr_array_tma_pingpong : cute::false_type {}; + +template<> +struct sm90_is_ptr_array_tma_pingpong : cute::true_type {}; + +template +static constexpr bool sm90_is_ptr_array_tma_pingpong_v = sm90_is_ptr_array_tma_pingpong::value; + +template +struct sm90_is_ptr_array_tma_dispatch_policy : cute::false_type {}; + +template< + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups +> +struct sm90_is_ptr_array_tma_dispatch_policy< + Sm90PtrArrayTmaWarpSpecialized> + : cute::true_type {}; + +template +static constexpr bool sm90_is_ptr_array_tma_dispatch_policy_v = sm90_is_ptr_array_tma_dispatch_policy::value; + +using cutlass::atomic_maximum; + +template +static constexpr int elements_per_access_v = cutlass::sizeof_bits::value / cutlass::sizeof_bits::value; + +template +static constexpr bool sm90_is_cooperative_v = + cute::is_base_of_v || + sm90_is_ptr_array_tma_cooperative_v; + +template +static constexpr bool sm90_is_warp_specialized_v = + (!sm90_is_ptr_array_tma_cooperative_v && sm90_is_ptr_array_tma_v) || + cute::is_base_of_v; + +template +static constexpr bool is_im2col_mode = + cute::is_same_v || + cute::is_same_v || + cute::is_same_v; + +template +struct EmptyStorage { + CUTLASS_HOST_DEVICE + T* data() { return nullptr; } +}; + +template +CUTLASS_HOST_DEVICE +auto get_epilogue_stride(Stride stride){ + if constexpr (cute::is_base_of_v|| + cute::is_base_of_v) { + return cute::make_stride(cute::get<1>(stride), cute::get<0>(stride), cute::get<2>(stride)); + } + else { + return stride; + } +} + +template +struct IsThreadEpilogueOpWithBias { + static constexpr bool value = false; + using type = typename ThreadEpilogueOp::ElementCompute; +}; + +template +struct IsThreadEpilogueOpWithBias > { + static constexpr bool value = true; + using type = typename 
ThreadEpilogueOp::ElementBias; +}; + +template +struct IsThreadEpilogueOpWithPerChannelScaling { + static constexpr bool value = false; +}; + +template +struct IsThreadEpilogueOpWithPerChannelScaling > { + static constexpr bool value = true; +}; + +template +struct IsThreadEpilogueOpWithActivation { + static constexpr bool value = false; + using type = void; +}; + +template +struct IsThreadEpilogueOpWithActivation > { + static constexpr bool value = true; + using type = typename ThreadEpilogueOp::ActivationFn; +}; + +template +struct IsThreadEpilogueOpWithElementwiseArguments : cute::false_type {}; + +template +struct IsThreadEpilogueOpWithElementwiseArguments< + ThreadEpilogueOp, + cute::void_t> : cute::true_type {}; + +// Wrapper class to use operator-style epilogues in sm90 TMA warp-specialized kernels +template +class Sm90TmaWarpSpecializedAdapter : public EpilogueOp { +public: + using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + + using LoadPipeline = cutlass::PipelineTransactionAsync<0>; + using LoadPipelineState = cutlass::PipelineState<0>; + constexpr static uint32_t TmaTransactionBytes = 0; + constexpr static bool RequiresTransactionBytes = false; + + using StorePipeline = cutlass::PipelineTmaStore<0>; + using StorePipelineState = cutlass::PipelineState<0>; + + using TensorStorage = typename EpilogueOp::SharedStorage; + using TensorMapStorage = typename EpilogueOp::SharedStorage; + using PipelineStorage = typename LoadPipeline::SharedStorage; + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_load_pipe_increment(CtaTileMNK) { + return 1; + } + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_store_pipe_increment(CtaTileMNK) { + return 1; + } + + CUTLASS_DEVICE + static void prefetch_tma_descriptors([[maybe_unused]] typename EpilogueOp::Params const&) { + } + + // ctor inheritance + using EpilogueOp::EpilogueOp; + + CUTLASS_HOST_DEVICE + Sm90TmaWarpSpecializedAdapter( + typename EpilogueOp::Params const& params, + [[maybe_unused]] TensorStorage& shared_tensors) + : EpilogueOp(params) { } + + CUTLASS_DEVICE + bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE auto + load_init( + [[maybe_unused]] typename EpilogueOp::Params const& params, + [[maybe_unused]] TensorMapStorage& shared_tensormaps, + [[maybe_unused]] int32_t sm_count, + [[maybe_unused]] int32_t sm_idx) { + return cute::make_tuple(nullptr); + } + + template< + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class TiledMma + > + CUTLASS_DEVICE auto + load( + [[maybe_unused]] LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + [[maybe_unused]] ProblemShapeMNKL problem_shape_mnkl, + [[maybe_unused]] CtaTileMNK cta_tile_mnk, + [[maybe_unused]] CtaCoordMNKL cta_coord_mnkl, + [[maybe_unused]] TiledMma tiled_mma, + [[maybe_unused]] int thread_idx, + [[maybe_unused]] TensorStorage& shared_tensors, + [[maybe_unused]] int subtile_idx=-1) + { + return load_pipe_producer_state; + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class TiledMma, + class TensorMapC + > + CUTLASS_DEVICE auto + load( + [[maybe_unused]] LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + [[maybe_unused]] ProblemShapeMNKL problem_shape_mnkl, + [[maybe_unused]] TileShapeMNK tile_shape_MNK, + [[maybe_unused]] TileCoordMNKL tile_coord_mnkl, + [[maybe_unused]] TiledMma tiled_mma, + [[maybe_unused]] int thread_idx, + [[maybe_unused]] TensorStorage& shared_tensors, + [[maybe_unused]] 
TensorMapC const& load_tensormap, + [[maybe_unused]] int subtile_idx=-1, + [[maybe_unused]] bool wait = false) + { + return load_pipe_producer_state; + } + + CUTLASS_DEVICE auto + load_tail( + [[maybe_unused]] LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state) + { + return load_pipe_producer_state; + } + + CUTLASS_DEVICE auto + store_init( + [[maybe_unused]] typename EpilogueOp::Params const& params, + [[maybe_unused]] TensorMapStorage& shared_tensormaps, + [[maybe_unused]] int32_t sm_count, + [[maybe_unused]] int32_t sm_idx, + [[maybe_unused]] int32_t warp_group_idx) { + return cute::make_tuple(nullptr); + } + + template< + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class AccEngine, class AccLayout, + class TiledMma + > + CUTLASS_DEVICE auto + store( + [[maybe_unused]] LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + [[maybe_unused]] StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + cute::Tensor accumulators, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + int subtile_index = -1) + { + constexpr int BLK_M_RANK = cute::rank<0>(cta_tile_mnk); + auto m_max_coord = unwrap(cute::transform(make_seq{}, [&](auto i) { + return get<0,i>(problem_shape_mnkl) - get<0,i>(cta_tile_mnk) * get<0,i>(cta_coord_mnkl); + })); + + constexpr int BLK_N_RANK = cute::rank<1>(cta_tile_mnk); + auto n_max_coord = unwrap(cute::transform(make_seq{}, [&](auto i) { + return get<1,i>(problem_shape_mnkl) - get<1,i>(cta_tile_mnk) * get<1,i>(cta_coord_mnkl); + })); + + auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{}); + + (*this)( + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + accumulators, + tiled_mma, + residue_mnk, + thread_idx, + reinterpret_cast(&shared_tensors)); + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class AccEngine, class AccLayout, + class TiledMma, + class TensorMapD + > + CUTLASS_DEVICE auto + store( + [[maybe_unused]] LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + [[maybe_unused]] StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_MNK, + TileCoordMNKL tile_coord_mnkl, + cute::Tensor accumulators, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + [[maybe_unused]] TensorMapD const& store_tensormap, + int subtile_index = -1) + { + constexpr int BLK_M_RANK = cute::rank<0>(tile_shape_MNK); + auto m_max_coord = unwrap(cute::transform(make_seq{}, [&](auto i) { + return get<0,i>(problem_shape_mnkl) - get<0,i>(tile_shape_MNK) * get<0,i>(tile_coord_mnkl); + })); + + constexpr int BLK_N_RANK = cute::rank<1>(tile_shape_MNK); + auto n_max_coord = unwrap(cute::transform(make_seq{}, [&](auto i) { + return get<1,i>(problem_shape_mnkl) - get<1,i>(tile_shape_MNK) * get<1,i>(tile_coord_mnkl); + })); + + auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{}); + + (*this)( + problem_shape_mnkl, + tile_shape_MNK, + tile_coord_mnkl, + accumulators, + tiled_mma, + residue_mnk, + thread_idx, + reinterpret_cast(&shared_tensors)); + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + CUTLASS_DEVICE auto + store_tail( + [[maybe_unused]] LoadPipeline load_pipeline, + 
LoadPipelineState load_pipe_consumer_state, + [[maybe_unused]] StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state) { + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + // Dummy methods to perform different parts of TMA/Tensormap modifications + + template + CUTLASS_DEVICE + void + tensormaps_perform_update( + [[maybe_unused]] TensorMapStorage& shared_tensormaps, + [[maybe_unused]] typename EpilogueOp::Params const& params, + [[maybe_unused]] cute::TmaDescriptor const* tensormap, + [[maybe_unused]] ProblemShapeMNKL problem_shape, + [[maybe_unused]] int32_t next_batch, + [[maybe_unused]] int32_t warp_group_idx) { } + + template + CUTLASS_DEVICE + void + tensormaps_cp_fence_release( + [[maybe_unused]] TensorMapStorage& shared_tensormaps, + [[maybe_unused]] cute::TmaDescriptor const* tensormap, + [[maybe_unused]] int32_t warp_group_idx) { } + + template + CUTLASS_DEVICE + void + tensormaps_fence_acquire([[maybe_unused]] cute::TmaDescriptor const* tensormap) { } +}; + +// SFINAE helpers for detecting beta/beta_ptr/beta_ptr_array in EVT arguments. +template +struct has_beta { + static constexpr bool value = false; +}; + +template +struct has_beta> { + static constexpr bool value = true; +}; + +template +struct has_beta_ptr { + static constexpr bool value = false; +}; + +template +struct has_beta_ptr> { + static constexpr bool value = true; +}; + +template +struct has_beta_ptr_array { + static constexpr bool value = false; +}; + +template +struct has_beta_ptr_array> { + static constexpr bool value = true; +}; + +} // namespace detail +} // namespace collective +} // namespace epilogue +} // namespace cutlass diff --git a/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp b/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp new file mode 100644 index 0000000000..48833ecf10 --- /dev/null +++ b/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp @@ -0,0 +1,271 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Functor for performing tensor-tensor broadacasts atop existing epilogues. + + Concretely, the opeartion performed is the following: + UnaryOp( + BinaryOp1( + BinaryOp0( + Activation((alpha * A @ B) + bias), + beta * C0 + ), + beta * C1 + ) + ) + + where: + - C0 and C1 have the same extents as the output + - BinaryOp0 and BinaryOp1 perform elementwise binary operations + - UnaryOp is an elementwise operation +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/collective/detail.hpp" + +#include "cute/tensor.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Collective epilogue that applies elementwise tensor-tensor operations atop other epilogues +/// +template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class EpilogueSchedule_, + bool PerColumnBias_ = false +> +class EpilogueTensorBroadcast { +public: + // + // Type Aliases + // + using EpilogueSchedule = EpilogueSchedule_; + + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementBias = typename ThreadEpilogueOp::ElementBias; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + using ActivationFunctor = typename ThreadEpilogueOp::ActivationFunctor; + + static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + static constexpr int kOutputAlignment = ThreadEpilogueOp::kCount; + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static constexpr bool IsBinaryOp0Enabled = ThreadEpilogueOp::IsBinaryOp0Enabled; + static constexpr bool IsBinaryOp1Enabled = ThreadEpilogueOp::IsBinaryOp1Enabled; + static constexpr bool IsUnaryOpEnabled = ThreadEpilogueOp::IsUnaryOpEnabled; + + static constexpr bool PerColumnBias = PerColumnBias_; + using BiasStride = typename cute::conditional_t, Stride<_1, _0, _0>>; + + struct SharedStorage { }; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread{}; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + ElementBias* ptr_Bias = nullptr; + ElementC* ptr_C0 = nullptr; + ElementC* ptr_C1 = nullptr; + }; + + // Device side epilogue params 
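  // (Params are the host Arguments verbatim; no device-side repacking is needed.
  //  The per-element computation follows the formula in the file header, e.g. with
  //  BinaryOp0 = BinaryOp1 = plus and UnaryOp = identity:
  //    D = Activation(alpha * acc + bias) + beta * C0 + beta * C1.)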
+ using Params = Arguments; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& _, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + EpilogueTensorBroadcast(Params const& params_) + : params(params_), epilogue_op(params_.thread) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + return epilogue_op.is_source0_needed() || epilogue_op.is_source1_needed(); + } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_HOST_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + [[maybe_unused]] char* smem_buf) + { + using namespace cute; + using X = Underscore; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 4"); + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + auto stride_c = detail::get_epilogue_stride(params.dC); + auto stride_d = detail::get_epilogue_stride(params.dD); + auto stride_bias = detail::get_epilogue_stride(BiasStride{}); + + // Represent the full output tensor + Tensor mC0_mnl = make_tensor(make_gmem_ptr(params.ptr_C0), make_shape(M,N,L), stride_c); // (m,n,l) + Tensor mC1_mnl = make_tensor(make_gmem_ptr(params.ptr_C1), make_shape(M,N,L), stride_c); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d); // (m,n,l) + Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), stride_bias); // (m,n,l) + + Tensor gC0_mnl = local_tile(mC0_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gC1_mnl = local_tile(mC1_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + // Slice to get the tile this thread block is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + Tensor gC0 = gC0_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gC1 = gC1_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + + // Partition 
source and destination tiles to match the accumulator partitioning + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCgD = thr_mma.partition_C(gD); // (VEC,THR_M,THR_N) + Tensor tCgC0 = thr_mma.partition_C(gC0); // (VEC,THR_M,THR_N) + Tensor tCgC1 = thr_mma.partition_C(gC1); // (VEC,THR_M,THR_N) + Tensor tCgBias = thr_mma.partition_C(gBias); // (VEC,THR_M,THR_N) + + static_assert(is_static::value, + "Accumulator layout must be static"); + CUTE_STATIC_ASSERT_V(size(tCgC0) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgC1) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators), + "Accumulator count must have the same destination element count."); + CUTE_STATIC_ASSERT_V(size(tCgBias) == size(accumulators), + "Accumulator count must have the same destination element count."); + + auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD)))); + Tensor tCcD = thr_mma.partition_C(cD); + + bool bias_needed = params.ptr_Bias != nullptr; + bool c0_needed = (params.ptr_C0 != nullptr) && epilogue_op.is_source0_needed(); + bool c1_needed = (params.ptr_C1 != nullptr) && epilogue_op.is_source1_needed(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + ElementBias bias = bias_needed ? tCgBias(i) : ElementBias(0); + ElementC c0 = c0_needed ? tCgC0(i) : ElementC(0); + ElementC c1 = c1_needed ? tCgC1(i) : ElementC(0); + + tCgD(i) = epilogue_op(accumulators(i), c0, c1, bias); + } + } + } + +private: + Params params; + ThreadEpilogueOp epilogue_op; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp b/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp new file mode 100644 index 0000000000..a8083dab1d --- /dev/null +++ b/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp @@ -0,0 +1,549 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class StrideC, + class StrideD, + class ThreadEpilogueOp, + class SmemLayout, + class CopyAtomR2S, + class TiledCopyS2R, + class CopyAtomR2G, + class EpilogueScheduleType = EpilogueSimtVectorized, + class Enable = void +> +class Epilogue { + static_assert(cute::is_same_v || + cute::is_same_v, + "Could not find an epilogue specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Epilogue Vectorized +/// Applies an element wise operation to all elements within the fragment +/// and writes it out to destination storage. +/// +/// Ways to generalize this: +/// - CTA tile shape +/// - vectorization requirements (GMEM) +/// - vectoriz(able) transform() +/// +template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class SmemLayout_, + class CopyAtomR2S_, + class TiledCopyS2R_, + class CopyAtomR2G_, + class EpilogueScheduleType_ +> +class Epilogue< + StrideC_, + StrideD_, + ThreadEpilogueOp_, + SmemLayout_, + CopyAtomR2S_, + TiledCopyS2R_, + CopyAtomR2G_, + EpilogueScheduleType_, + cute::enable_if_t< + cute::is_same_v + > + > { +public: + // + // Type Aliases + // + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + using ElementBias = typename detail::IsThreadEpilogueOpWithBias::type; + using SmemLayout = SmemLayout_; + using CopyAtomR2S = CopyAtomR2S_; + using TiledCopyS2R = TiledCopyS2R_; + using CopyAtomR2G = CopyAtomR2G_; + + using GmemTiledCopyC = void; + using GmemTiledCopyD = CopyAtomR2G; + + static constexpr bool IsEpilogueBiasSupported = detail::IsThreadEpilogueOpWithBias::value; + using StrideBias = cute::conditional_t(), Stride<_1,_0,int64_t>, Stride<_0,_1,int64_t>>; + + static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage + { + cute::array_aligned> smem_epilogue; + }; + + static constexpr bool IsActHasArgs = 
detail::IsThreadEpilogueOpWithElementwiseArguments::value; + + // Host side epilogue arguments + template + struct ThreadEpilogueOpArguments { + ElementScalar alpha{0}; + ElementScalar beta{0}; + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias{}; + }; + + template + struct ThreadEpilogueOpArguments< + ThreadEpiOp, + cute::enable_if_t::value>> { + ElementScalar alpha{0}; + ElementScalar beta{0}; + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias{}; + typename ThreadEpiOp::ElementwiseArguments activation{}; + }; + + struct Arguments { + ThreadEpilogueOpArguments thread{}; + using StrideBias = decltype(thread.dBias); + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + template + struct ParamsType { + typename ThreadEpiOp::Params thread{}; + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + ElementBias const* ptr_Bias = nullptr; + StrideBias dBias{}; + }; + + template + struct ParamsType< + ThreadEpiOp, + cute::enable_if_t::value>> { + typename ThreadEpiOp::Params thread{}; + typename ThreadEpiOp::ElementwiseArguments activation{}; + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + ElementBias const* ptr_Bias = nullptr; + StrideBias dBias{}; + }; + + using Params = ParamsType; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& _, + Arguments const& args, + [[maybe_unused]] void* workspace) { + typename ThreadEpilogueOp::Params thread_op_args; + thread_op_args.alpha = args.thread.alpha; + thread_op_args.beta = args.thread.beta; + thread_op_args.alpha_ptr = args.thread.alpha_ptr; + thread_op_args.beta_ptr = args.thread.beta_ptr; + + if constexpr (IsActHasArgs) { + return { + thread_op_args, + args.thread.activation, + args.ptr_C, + args.dC, + args.ptr_D, + args.dD, + args.thread.bias_ptr, + args.thread.dBias + }; + } + else { + return { + thread_op_args, + args.ptr_C, + args.dC, + args.ptr_D, + args.dD, + args.thread.bias_ptr, + args.thread.dBias + }; + } + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + Epilogue(Params const& params_) + : params(params_), epilogue_op(params_.thread) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + return epilogue_op.is_source_needed(); + } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, // (MMA,MMA_M,MMA_N) + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + char* smem_buf) { + using namespace cute; + using X = 
Underscore; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + // synchronizing function for smem reads/writes +#if CUDA_BARRIER_ENABLED + auto synchronize = [] () { cutlass::arch::NamedBarrier::sync(typename TiledCopyS2R::TiledNumThr{}, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; +#else + auto synchronize = [] () { __syncthreads(); }; +#endif + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + // Represent the full output tensor + Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), params.dD); // (m,n,l) + Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), params.dBias); // (m,n,l) + + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + + // Construct a tensor in SMEM that we can partition for rearranging data + SharedStorage& storage = *reinterpret_cast(smem_buf); + Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{}); // (SMEM_M,SMEM_N) + + // Partition sAcc to match the accumulator partitioning + auto tiled_r2s = make_tiled_copy_C(CopyAtomR2S{}, tiled_mma); + auto thread_r2s = tiled_r2s.get_thread_slice(thread_idx); + Tensor tRS_rAcc = thread_r2s.retile_S(accumulators); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor tRS_sAcc = thread_r2s.partition_D(sAcc); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + // Tile gD and gC by the shape of SmemLayout first + auto tile = make_shape(size<0>(sAcc), size<1>(sAcc)); + Tensor gCt = flat_divide(gC, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor gDt = flat_divide(gD, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor gBiast = flat_divide(gBias, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + + // Partition sAcc, gC, and gD for the output + auto tiled_s2r = TiledCopyS2R{}; + auto thread_s2r = tiled_s2r.get_thread_slice(thread_idx); + Tensor tSR_sAcc = thread_s2r.partition_S(sAcc); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tSR_gC = thread_s2r.partition_D(gCt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + Tensor tSR_gD = thread_s2r.partition_D(gDt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + Tensor tSR_gBias = thread_s2r.partition_D(gBiast); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + // Allocate intermediate registers on the dst tensors + Tensor tSR_rAcc = make_tensor(take<0,3>(shape(tSR_gC))); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tSR_rC = make_tensor(shape(tSR_rAcc)); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tSR_rD = 
make_tensor(shape(tSR_rAcc)); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tSR_rBias = make_tensor_like(tSR_gBias); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + // Repeat the D-partitioning for coordinates and predication + Tensor cD = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n) + Tensor cDt = flat_divide(cD, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor tSR_cD = thread_s2r.partition_D(cDt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + CUTE_STATIC_ASSERT(size<1>(tRS_rAcc) % size<3>(tSR_gC) == 0); // TILE_M divides MMA_M + CUTE_STATIC_ASSERT(size<2>(tRS_rAcc) % size<4>(tSR_gC) == 0); // TILE_N divides MMA_N + +#if 0 + if (thread_idx == 0 && m_coord == 0 && n_coord == 0) { + print("aC : "); print(accumulators.layout()); print("\n"); + print("gC : "); print(gC.layout()); print("\n"); + print("gD : "); print(gD.layout()); print("\n"); + print("gBias : "); print(gBias.layout()); print("\n"); + print("sAcc : "); print(sAcc.layout()); print("\n"); + print("\n"); + print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n"); + print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n"); + print("\n"); + print("gDt : "); print(gDt.layout()); print("\n"); + print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n"); + print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n"); + print("\n"); + print("tSR_rC : "); print(tSR_rC.layout()); print("\n"); + print("tSR_rD : "); print(tSR_rD.layout()); print("\n"); + print("tSR_gC : "); print(tSR_gC.layout()); print("\n"); + print("tSR_gD : "); print(tSR_gD.layout()); print("\n"); + print("\n"); + print("gBiast : "); print(gBiast.layout()); print("\n"); + print("tSR_gBias : "); print(tSR_gBias.layout()); print("\n"); + print("tSR_rBias : "); print(tSR_rBias.layout()); print("\n"); + } +#endif + + if constexpr (IsEpilogueBiasSupported) { + if (params.ptr_Bias) { + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + Tensor tSR_gBias_flt = filter_zeros(tSR_gBias); + Tensor tSR_rBias_flt = filter_zeros(tSR_rBias); + Tensor tSR_cD_flt = filter_zeros(tSR_cD, tSR_gBias.stride()); + + // Step 0. Copy Bias from GMEM to fragment + auto pred_fn = [&] (auto const&... coords) { return elem_less(tSR_cD_flt(coords...), take<0, 2>(residue_mnk)); }; + copy_if(pred_fn, tSR_gBias_flt, tSR_rBias_flt); + } + } + + // For each tiling needed for SmemLayout to cover shape(gD) + CUTLASS_PRAGMA_UNROLL + for (int step_m = 0; step_m < size<2>(cDt); ++step_m) { + CUTLASS_PRAGMA_UNROLL + for (int step_n = 0; step_n < size<3>(cDt); ++step_n) { + // Step 1. Copy to SMEM + CUTLASS_PRAGMA_UNROLL + for (int pipe_m = 0; pipe_m < size<1>(tRS_sAcc); ++pipe_m) { + CUTLASS_PRAGMA_UNROLL + for (int pipe_n = 0; pipe_n < size<2>(tRS_sAcc); ++pipe_n) { + int mma_m = step_m * size<1>(tRS_sAcc) + pipe_m; + int mma_n = step_n * size<2>(tRS_sAcc) + pipe_n; + + copy(tiled_r2s, tRS_rAcc(_,mma_m,mma_n), tRS_sAcc(_,pipe_m,pipe_n)); + } + } + + // Step 2. Wait for SMEM writes to complete + synchronize(); + + // Step 3. Copy from SMEM into a fragment + copy(tiled_s2r, tSR_sAcc, tSR_rAcc); + + // Step 4. 
Wait for SMEM reads to complete + synchronize(); + + Tensor tSR_gDmn = tSR_gD(_,_,_,step_m,step_n); + Tensor tSR_cDmn = tSR_cD(_,_,_,step_m,step_n); + + if constexpr (IsEpilogueBiasSupported) { + Tensor tSR_rBiasmn = tSR_rBias(_,_,_,step_m,step_n); + + if (epilogue_op.is_source_needed()) { + // source is needed + Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n); + + // Step 5. Copy C from GMEM to a fragment + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<0>(tSR_rAcc); ++i) { + tSR_rC(i,m,n) = tSR_gCmn(i,m,n); + } + } + } + } + + // Step 6. Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tSR_rAcc); ++i) { + if constexpr (IsActHasArgs) { + epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rC(i), tSR_rBiasmn(i), params.activation); + } else { + epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rC(i), tSR_rBiasmn(i)); + } + } + } + else { + // source is not needed, avoid load and lift compute + + // Step 5. Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tSR_rAcc); ++i) { + if constexpr (IsActHasArgs) { + epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rBiasmn(i), params.activation); + } else { + epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rBiasmn(i)); + } + } + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + // The Last Step. Copy to GMEM + copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n)); + } + } + } + } else { + if (epilogue_op.is_source_needed()) { + // source is needed + Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n); + + // Step 5. Copy C from GMEM to a fragment + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<0>(tSR_rAcc); ++i) { + tSR_rC(i,m,n) = tSR_gCmn(i,m,n); + } + } + } + } + + // Step 6. Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tSR_rAcc); ++i) { + tSR_rD(i) = epilogue_op(tSR_rAcc(i), tSR_rC(i)); + } + } + else { + // source is not needed, avoid load and lift compute + + // Step 5. Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tSR_rAcc); ++i) { + tSR_rD(i) = epilogue_op(tSR_rAcc(i)); + } + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + // The Last Step. 
Copy to GMEM + copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n)); + } + } + } + } + } + } + } + +private: + Params params; + ThreadEpilogueOp epilogue_op; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp b/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp new file mode 100644 index 0000000000..8a70370b21 --- /dev/null +++ b/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp @@ -0,0 +1,412 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Ptr Array Epilogue Vectorized +/// Applies an element wise operation to all elements within the fragment +/// and writes it out to destination storage. 
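/// Unlike the single-batch specialization above, C and D are addressed through
/// per-batch pointer arrays (params.ptr_C[l] / params.ptr_D[l]); for grouped GEMM
/// the strides are likewise read per group inside operator().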
+/// +/// Ways to generalize this: +/// - CTA tile shape +/// - vectorization requirements (GMEM) +/// - vectoriz(able) transform() +/// +template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class SmemLayout_, + class CopyAtomR2S_, + class TiledCopyS2R_, + class CopyAtomR2G_, + class EpilogueScheduleType_ +> +class Epilogue< + StrideC_, + StrideD_, + ThreadEpilogueOp_, + SmemLayout_, + CopyAtomR2S_, + TiledCopyS2R_, + CopyAtomR2G_, + EpilogueScheduleType_, + cute::enable_if_t< + cute::is_same_v + > + > { +public: + // + // Type Aliases + // + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using InternalStrideC = cute::remove_pointer_t; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + using InternalStrideD = cute::remove_pointer_t; + + using SmemLayout = SmemLayout_; + using CopyAtomR2S = CopyAtomR2S_; + using TiledCopyS2R = TiledCopyS2R_; + using CopyAtomR2G = CopyAtomR2G_; + + using GmemTiledCopyC = TiledCopyS2R; + using GmemTiledCopyD = TiledCopyS2R; + + static const int kOutputAlignment = ThreadEpilogueOp::kCount; + + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static_assert(cute::rank(InternalStrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(cute::rank(InternalStrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage + { + cute::array_aligned> smem_epilogue; + }; + + using TensorMapStorage = SharedStorage; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread{}; + ElementC const** ptr_C = nullptr; + StrideC dC{}; + ElementD** ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + using Params = Arguments; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + ProblemShape const&, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + Epilogue(Params const& params_) + : params(params_) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + // For Ptr-Array or Grouped Gemm we cannot determine if source is needed based on first beta. 
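    // (Conservatively assume C may be read; the per-batch beta, and therefore the
    //  real answer, is only known once operator() builds the thread-level op for
    //  the current l coordinate.)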
+ return true; + } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, // (MMA,MMA_M,MMA_N) + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + char* smem_buf) { + using namespace cute; + using X = Underscore; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + // synchronizing function for smem reads/writes +#if CUDA_BARRIER_ENABLED + auto synchronize = [] () { cutlass::arch::NamedBarrier::sync(typename TiledCopyS2R::TiledNumThr{}, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; +#else + auto synchronize = [] () { __syncthreads(); }; +#endif + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + // Batches are managed by using appropriate pointers to C and D matrices + const int32_t mock_L = 1; + const int32_t mock_l_coord = 0; + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + + // If scalar alpha/beta are provided, i.e., same alpha/beta applies to all batches/groups. + // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups, + // we get the correct alpha/beta values for the current batch/group using group index. 
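+    // Illustration (hypothetical host-side setup; member names follow the LinearCombination-style
+    // operators checked elsewhere in this change and may differ for other thread operators):
+    //   args.thread.beta = 0.f;                // one beta applied to every batch/group, or
+    //   args.thread.beta_ptr_array = d_betas;  // one beta per group, selected below via l_coord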
+ ThreadEpilogueOp epilogue_op = ThreadEpilogueOp(params.thread, l_coord); + + if (epilogue_op.is_source_needed() && params.dC == nullptr) { + // Beta value is non-zero while pointer to C is a nullptr + assert(0); + } + + InternalStrideC stride_c; + InternalStrideD stride_d; + if constexpr (!cute::is_same_v) { + // If grouped gemm + if (epilogue_op.is_source_needed()) { + stride_c = params.dC[l_coord]; + } + stride_d = params.dD[l_coord]; + } + else { + stride_c = params.dC; + stride_d = params.dD; + } + + // Represent the full output tensor + ElementC const* ptr_C_l = nullptr; + if (epilogue_op.is_source_needed()) { + ptr_C_l = params.ptr_C[l_coord]; + } + Tensor mC_mnl = make_tensor(make_gmem_ptr(ptr_C_l), make_shape(M,N,mock_L), stride_c); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), make_shape(M,N,mock_L), stride_d); // (m,n,l) + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + Tensor gC = gC_mnl(_,_,m_coord,n_coord,mock_l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,mock_l_coord); // (BLK_M,BLK_N) + + // Construct a tensor in SMEM that we can partition for rearranging data + SharedStorage& storage = *reinterpret_cast(smem_buf); + Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{}); // (SMEM_M,SMEM_N) + + // Partition sAcc to match the accumulator partitioning + auto tiled_r2s = make_tiled_copy_C(CopyAtomR2S{}, tiled_mma); + auto thread_r2s = tiled_r2s.get_thread_slice(thread_idx); + Tensor tRS_rAcc = thread_r2s.retile_S(accumulators); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor tRS_sAcc = thread_r2s.partition_D(sAcc); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + // Tile gD and gC by the shape of SmemLayout first + auto tile = make_shape(size<0>(sAcc), size<1>(sAcc)); + Tensor gCt = flat_divide(gC, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor gDt = flat_divide(gD, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + + // Partition sAcc, gC, and gD for the output + auto tiled_s2r = TiledCopyS2R{}; + auto thread_s2r = tiled_s2r.get_thread_slice(thread_idx); + Tensor tSR_sAcc = thread_s2r.partition_S(sAcc); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tSR_gC = thread_s2r.partition_D(gCt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + Tensor tSR_gD = thread_s2r.partition_D(gDt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + // Allocate intermediate registers on the dst tensors + Tensor tSR_rAcc = make_tensor(take<0,3>(shape(tSR_gC))); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tSR_rD = make_tensor(shape(tSR_rAcc)); // ((Atom,AtomNum),ATOM_M,ATOM_N) + + // Repeat the D-partitioning for coordinates and predication + Tensor cD = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n) + Tensor cDt = flat_divide(cD, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor tSR_cD = thread_s2r.partition_D(cDt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + CUTE_STATIC_ASSERT(size<1>(tRS_rAcc) % size<3>(tSR_gC) == 0); // TILE_M divides MMA_M + CUTE_STATIC_ASSERT(size<2>(tRS_rAcc) % size<4>(tSR_gC) == 0); // TILE_N divides MMA_N + +#if 0 + if (thread_idx == 0 && m_coord == 0 && n_coord == 0) { + print("aC : "); print(accumulators.layout()); print("\n"); + print("gC : "); print(gC.layout()); print("\n"); + print("gD : "); print(gD.layout()); print("\n"); + print("sAcc : "); print(sAcc.layout()); print("\n"); + 
print("\n"); + print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n"); + print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n"); + print("\n"); + print("gDt : "); print(gDt.layout()); print("\n"); + print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n"); + print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n"); + print("\n"); + print("tSR_rD : "); print(tSR_rD.layout()); print("\n"); + print("tSR_gC : "); print(tSR_gC.layout()); print("\n"); + print("tSR_gD : "); print(tSR_gD.layout()); print("\n"); + print("\n"); + } +#endif + + // For each tiling needed for SmemLayout to cover shape(gD) + CUTLASS_PRAGMA_UNROLL + for (int step_m = 0; step_m < size<2>(cDt); ++step_m) { + CUTLASS_PRAGMA_UNROLL + for (int step_n = 0; step_n < size<3>(cDt); ++step_n) { + // Step 1. Copy to SMEM + CUTLASS_PRAGMA_UNROLL + for (int pipe_m = 0; pipe_m < size<1>(tRS_sAcc); ++pipe_m) { + CUTLASS_PRAGMA_UNROLL + for (int pipe_n = 0; pipe_n < size<2>(tRS_sAcc); ++pipe_n) { + int mma_m = step_m * size<1>(tRS_sAcc) + pipe_m; + int mma_n = step_n * size<2>(tRS_sAcc) + pipe_n; + + copy(tiled_r2s, tRS_rAcc(_,mma_m,mma_n), tRS_sAcc(_,pipe_m,pipe_n)); + } + } + + // Step 2. Wait for SMEM writes to complete + synchronize(); + + // Step 3. Copy from SMEM into a fragment + copy(tiled_s2r, tSR_sAcc, tSR_rAcc); + + // Step 4. Wait for SMEM reads to complete + synchronize(); + + Tensor tSR_gDmn = tSR_gD(_,_,_,step_m,step_n); + Tensor tSR_cDmn = tSR_cD(_,_,_,step_m,step_n); + + if (epilogue_op.is_source_needed()) { + // source is needed + Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n); + + Tensor tSR_rCmn = make_tensor(shape(tSR_gCmn)); // ((Atom,AtomNum),ATOM_M,ATOM_N) + + // Step 5. Copy C from GMEM to a fragment + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<0>(tSR_rAcc); ++i) { + tSR_rCmn(i,m,n) = tSR_gCmn(i,m,n); + } + } + } + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + // Step 6. Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<0>(tSR_rAcc); ++i) { + tSR_rD(i,m,n) = epilogue_op(tSR_rAcc(i,m,n), tSR_rCmn(i,m,n)); + } + // Step 7. Copy to GMEM + copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n)); + } + } + } + } + else { + // source is not needed, avoid load and lift compute + + // Step 5. Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tSR_rAcc); ++i) { + tSR_rD(i) = epilogue_op(tSR_rAcc(i)); + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + // Step 6. 
Copy to GMEM + copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n)); + } + } + } + } + } + } + } + +private: + Params params; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp new file mode 100644 index 0000000000..54fe9b1daf --- /dev/null +++ b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp @@ -0,0 +1,1198 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. 
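+
+    This header implements the pointer-array / grouped GEMM variant of the SM90 TMA
+    warp-specialized collective epilogue (dispatch policy Sm90PtrArrayTmaWarpSpecialized).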
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp" +#include "cutlass/detail/collective.hpp" +#include "cutlass/detail/layout.hpp" +#include "cutlass/trace.h" +#include "cutlass/cuda_host_adapter.hpp" + +#include "cute/tensor.hpp" +#include "cute/atom/copy_traits_sm90_tma.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_, + int NumEpilogueWarpGroups_, + class CtaTileMNK_, // (CTA_M,CTA_N,CTA_K) + class EpilogueTile_, // (EPI_TILE_M,EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class FusionCallbacks_, + class CopyOpG2S_, + class SmemLayoutAtomC_, + class CopyOpS2R_, + class CopyOpS2G_, + class SmemLayoutAtomD_, + class CopyOpR2S_, + class CopyAtomC_, + class CopyOpR2R_ +> +class CollectiveEpilogue< + Sm90PtrArrayTmaWarpSpecialized, + CtaTileMNK_, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpG2S_, + SmemLayoutAtomC_, + CopyOpS2R_, + CopyOpS2G_, + SmemLayoutAtomD_, + CopyOpR2S_, + CopyAtomC_, + CopyOpR2R_ +> { +public: + // + // Type Aliases + // + using DispatchPolicy = Sm90PtrArrayTmaWarpSpecialized; + using CtaTileMNK = CtaTileMNK_; + using EpilogueTile = EpilogueTile_; + using FusionCallbacks = FusionCallbacks_; + using ElementC = ElementC_; + using StrideC = StrideC_; + using InternalStrideC = cute::remove_pointer_t; + using ElementD = ElementD_; + using StrideD = StrideD_; + using InternalStrideD = cute::remove_pointer_t; + using CopyOpG2S = CopyOpG2S_; + using SmemLayoutAtomC = SmemLayoutAtomC_; + using CopyOpS2R = CopyOpS2R_; + using CopyOpS2G = CopyOpS2G_; + using SmemLayoutAtomD = SmemLayoutAtomD_; + using CopyOpR2S = CopyOpR2S_; + using CopyAtomC = CopyAtomC_; + using CopyOpR2R = CopyOpR2R_; + + using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits::Operation; + using GmemTiledCopyC = CopyOpG2S; + using GmemTiledCopyD = CopyOpS2G; + + static_assert(!is_layout::value && is_tuple::value, "EpilogueTile must be a cute::Tile or cute::Shape"); + static_assert(cute::rank(CtaTileMNK{}) == 3, "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]"); + static_assert(cute::rank(EpilogueTile{}) == 2, "EpilogueTile must be rank-2: [EPI_TILE_M, EPI_TILE_N]"); + static_assert(size<0>(CtaTileMNK{}) % size<0>(shape(EpilogueTile{})) == 0, "EPI_TILE_M must divide CTA_M"); + static_assert(size<1>(CtaTileMNK{}) % size<1>(shape(EpilogueTile{})) == 0, "EPI_TILE_N must divide CTA_N"); + static_assert(cute::rank(InternalStrideC{}) == 3, "StrideC must be rank-3: [M, N, L]"); + static_assert(cute::rank(InternalStrideD{}) == 3, "StrideD must be rank-3: [M, N, L]"); + +private: + constexpr static bool is_source_supported = not cute::is_void_v; + constexpr static bool is_destination_supported = not cute::is_void_v; + using NonVoidElementD = cute::conditional_t, ElementD>; + static_assert(not cute::is_void_v, "SmemElementD is void"); + using NonVoidElementC = 
cute::conditional_t; // prevents void ref breakages + + using SmemElementC = typename cutlass::detail::get_unpacked_element_type::type; + using SmemElementD = typename cutlass::detail::get_unpacked_element_type::type; + + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static bool ReuseSmemC = ReuseSmemC_ and is_destination_supported; + constexpr static bool DelayTmaStore = DelayTmaStore_; + + constexpr static bool is_m_major_C = detail::is_m_major(); + constexpr static bool is_m_major_D = detail::is_m_major(); + + constexpr static bool is_im2col_C = cute::is_same_v; + constexpr static bool is_im2col_D = cute::is_same_v; + + // Check if register transformation is needed before copying register to shared memory. + constexpr static bool IsUseR2R = !cute::is_void_v; + + using SmemLayoutC = decltype(tile_to_shape( + SmemLayoutAtomC{}, + make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + using SmemLayoutD = decltype(tile_to_shape( + SmemLayoutAtomD{}, + make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + + constexpr static bool support_smem_reuse = is_source_supported && is_destination_supported && StagesD <= StagesC + && cosize(take<0,2>(SmemLayoutC{})) == cosize(take<0,2>(SmemLayoutD{})); + static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met"); + + constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{}); + constexpr static size_t SmemAlignmentC = cutlass::detail::alignment_for_swizzle(SmemLayoutC{}); + constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD); + + using SmemArrayTypeC = cute::ArrayEngine>; + using SmemArrayTypeD = cute::ArrayEngine>; + + using EmptyType = cute::tuple<>; + using SmemCStorage = cute::conditional_t; + using SmemDStorage = cute::conditional_t; + + struct CollectiveStorageWithC { + alignas(SmemAlignmentC) ArrayEngine> smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageWithoutC { + cute::array smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageReuseC { + alignas(MaxSmemAlignment) ArrayEngine> smem_C; + alignas(MaxSmemAlignment) ArrayEngine> smem_D; + }; + +public: + // TMA pipeline for loading C + using LoadPipeline = cutlass::PipelineTransactionAsync; + using LoadPipelineState = cutlass::PipelineState; + constexpr static uint32_t TmaTransactionBytes = + (size(take<0,2>(SmemLayoutC{})) * static_cast(sizeof_bits::value)) / 8; + constexpr static bool RequiresTransactionBytes = true; + + constexpr static int NumEpilogueWarpGroups = NumEpilogueWarpGroups_; + + // TMA pipeline for storing D + using StorePipeline = cute::conditional_t, + cutlass::PipelineTmaStore>; + using StorePipelineState = cutlass::PipelineState; + + struct SharedStorage { + struct TensorStorage { + using CollectiveStorage = cute::conditional_t>; + CollectiveStorage collective; + + using FusionStorage = typename FusionCallbacks::SharedStorage; + FusionStorage thread; + } tensors; + + struct TensorMapStorage : cute::aligned_struct<128, _0> { + cute::TmaDescriptor smem_tensormap_C; + cute::array smem_tensormap_D; + } tensormaps; + + using PipelineStorage = typename LoadPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using TensorMapStorage = typename SharedStorage::TensorMapStorage; 
+ using PipelineStorage = typename SharedStorage::PipelineStorage; + + static constexpr bool IsGroupedGemmKernel = !cute::is_same_v; + + // Host side epilogue arguments + struct Arguments { + typename FusionCallbacks::Arguments thread{}; + ElementC const** ptr_C = nullptr; + StrideC dC; + ElementD ** ptr_D = nullptr; + StrideD dD; + }; + + // Device side epilogue params + struct Params { + using TMA_C = decltype(make_tma_copy( + CopyOpG2S{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), + repeat_like(InternalStrideC{}, int32_t(0)), InternalStrideC{}), + take<0,2>(SmemLayoutC{}), + EpilogueTile{}, + _1{})); + + using TMA_D = decltype(make_tma_copy( + CopyOpS2G{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), + repeat_like(InternalStrideD{}, int32_t(0)), InternalStrideD{}), + take<0,2>(SmemLayoutD{}), + EpilogueTile{}, + _1{})); + + typename FusionCallbacks::Params thread{}; + TMA_C tma_load_c; + TMA_D tma_store_d; + cute::TmaDescriptor* tensormaps; + ElementC const** ptr_C; + StrideC dC; + ElementD** ptr_D; + StrideD dD; + uint32_t tma_transaction_bytes = TmaTransactionBytes; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + ProblemShape const& problem_shape, + Arguments const& args, + [[maybe_unused]] void* workspace) { + // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. + // These will be replaced with correct values before the initial tma load. + auto init_shape = repeat_like(append<4>(typename ProblemShape::UnderlyingProblemShape{}, 1), int32_t(1)); + auto init_M = get<0>(init_shape); + auto init_N = get<1>(init_shape); + auto init_L = get<3>(init_shape); + + static_assert(!is_im2col_C and !is_im2col_D, "Im2Col not supported on C or D"); + + InternalStrideC stride_c; + InternalStrideD stride_d; + if constexpr (IsGroupedGemmKernel) { + // Strides for Grouped Gemm will be replaced prior to the first access regardless. + stride_c = InternalStrideC{}; + stride_d = InternalStrideD{}; + } + else { + // Tensor shapes for Ptr-Array are initialized correctly only here. 
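+      // For Ptr-Array, every batch shares one problem shape, so batch 0's host shape is
+      // representative here; for grouped GEMM the shapes/strides are patched per group on
+      // device via tensormaps_replace_global_tensor_properties() before the first access.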
+ auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(0), 1); + init_M = get<0>(problem_shape_MNKL); + init_N = get<1>(problem_shape_MNKL); + init_L = get<3>(problem_shape_MNKL); + + stride_c = args.dC; + stride_d = args.dD; + } + + uint32_t transaction_bytes = TmaTransactionBytes; + typename Params::TMA_C tma_load_c{}; + if constexpr (is_source_supported) { + ElementC const* ptr_C_first_batch = reinterpret_cast(args.ptr_C); + Tensor tensor_c = make_tensor(ptr_C_first_batch, make_layout(make_shape(init_M,init_N,init_L), append<3>(stride_c, _0{}))); + tma_load_c = make_tma_copy( + CopyOpG2S{}, + tensor_c, + take<0,2>(SmemLayoutC{}), + EpilogueTile{}, + _1{}); + } + + typename Params::TMA_D tma_store_d; + if constexpr (is_destination_supported) { + ElementD const* ptr_D_first_batch = reinterpret_cast(args.ptr_D); + Tensor tensor_d = make_tensor(ptr_D_first_batch, make_layout(make_shape(init_M,init_N,init_L), append<3>(stride_d, _0{}))); + tma_store_d = make_tma_copy( + CopyOpS2G{}, + tensor_d, + take<0,2>(SmemLayoutD{}), + EpilogueTile{}, + _1{}); + } + + auto fusion_workspace = static_cast(workspace); + auto fusion_workspace_size = FusionCallbacks::get_workspace_size(problem_shape, args.thread); + auto tma_descriptor_workspace = reinterpret_cast( + static_cast(workspace) + fusion_workspace_size); + + return { + FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, fusion_workspace), + tma_load_c, + tma_store_d, + tma_descriptor_workspace, + args.ptr_C, + args.dC, + args.ptr_D, + args.dD, + transaction_bytes, + }; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) { + + constexpr uint32_t NumInputTensors = NumEpilogueWarpGroups + (cute::is_void_v ? 
0 : 1); + auto descriptors_shape = cute::make_shape(sm_count, Int{}); + constexpr size_t SizeOfCuTensorMap = sizeof(cute::TmaDescriptor); + + // Allocate gmem space for input tensormaps per each SM, A tensormap copies followed by B tensormap copies + return (size(descriptors_shape) * SizeOfCuTensorMap) + FusionCallbacks::get_workspace_size(problem_shape, args.thread); + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter); + } + + template + static bool + can_implement( + ProblemShape problem_shape, + [[maybe_unused]] Arguments const& args) { + + bool implementable = true; + bool fusion_implementable = true; + + if (problem_shape.is_host_problem_shape_available()) { + for (int i = 0; i < problem_shape.groups(); ++i) { + auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(i), 1); + auto [M,N,K,L] = problem_shape_MNKL; + + if constexpr (is_destination_supported) { + constexpr int tma_alignment_bits_D = cutlass::detail::get_output_alignment_bits(); + constexpr int min_tma_aligned_elements_D = tma_alignment_bits_D / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,N,L), InternalStrideD{}); + } + + if constexpr (is_source_supported) { + constexpr int tma_alignment_bits_C = cutlass::detail::get_input_alignment_bits(); + constexpr int min_tma_aligned_elements_C = tma_alignment_bits_C / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,N,L), InternalStrideC{}); + } + + fusion_implementable = fusion_implementable && FusionCallbacks::can_implement(problem_shape_MNKL, args.thread); + } + } + else { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Ignoring check to can implement because host problem shape is not available.\n"); + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + } + + if (!fusion_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n"); + } + + bool beta_implementable = true; + + if (cute::is_void_v || args.ptr_C == nullptr) { + if constexpr (detail::has_beta::value) { + beta_implementable = args.thread.beta == 0.0; + } + if constexpr (detail::has_beta_ptr::value) { + beta_implementable = beta_implementable && args.thread.beta_ptr == nullptr; + } + if constexpr (detail::has_beta_ptr_array::value) { + beta_implementable = beta_implementable && args.thread.beta_ptr_array == nullptr; + } + } + + if (!beta_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Beta/beta pointer was set, but epilogue is sourceless (void-C).\n"); + } + + return implementable && fusion_implementable && beta_implementable; + } + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_load_pipe_increment(TileShapeMNK tile_shape_MNK) { + // Compute number of epilogue subtiles + return size<1>(zipped_divide(make_layout(take<0,2>(tile_shape_MNK)), EpilogueTile{})); + } + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_store_pipe_increment(TileShapeMNK tile_shape_MNK) { + return get_load_pipe_increment(tile_shape_MNK); + } + + CUTLASS_HOST_DEVICE + CollectiveEpilogue(Params const& params_, TensorStorage& shared_tensors) + : 
params(params_), fusion_callbacks(params_.thread, shared_tensors.thread) {} + + CUTLASS_DEVICE + bool + is_producer_load_needed() const { + return fusion_callbacks.is_producer_load_needed(); + } + + CUTLASS_DEVICE auto + load_init( + Params const& params, + TensorMapStorage& shared_tensormaps, + int32_t sm_count, + int32_t sm_idx) { + // Initialize tma for loading + constexpr bool IsLoad = true; + auto load_tensormaps = tensormaps_init(params, shared_tensormaps, sm_count, sm_idx, 0); + return load_tensormaps; + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class TiledMma, + class TensorMapC, + __CUTE_REQUIRES(std::is_pointer_v) + > + CUTLASS_DEVICE auto + load( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_MNK, + TileCoordMNKL tile_coord_mnkl, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + TensorMapC const& load_tensormap, + int subtile_idx=-1, + bool wait_until_load_finishes = false) { + using namespace cute; + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; + + static_assert(!is_im2col_D, "Do not support im2col"); + + auto coord_shape = append<3>(make_shape(m_coord, n_coord), Int<0>{}); + + // Represent the full source tensor, slice to get the tile this CTA is currently responsible for + Tensor mC_mn = params.tma_load_c.get_tma_tensor(append<3>(make_shape(M,N), Int<1>{})); // (M,N,L) + Tensor mC = coalesce(mC_mn, take<0,2>(CtaTileMNK{})); + Tensor gC = local_tile(mC, take<0,2>(CtaTileMNK{}), coord_shape); // (CTA_M,CTA_N) + + // Apply epilogue subtile, get matching smem tensor + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + Tensor gC_epi = flat_divide(gC, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor sC_epi = make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{}); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + + // Prepare the thread(b)lock's (G)mem to (S)mem TMA tiled copy (bGS_) + ThrCopy thrblk_g2s = params.tma_load_c.get_slice(Int<0>{}); + Tensor bGS_gC = thrblk_g2s.partition_S(gC_epi); // (G2S,G2S_M,G2S_N,EPI_M,EPI_N) + Tensor bGS_sC = thrblk_g2s.partition_D(sC_epi); // (G2S,G2S_M,G2S_N,PIPE_C) + + // Get the fusion callbacks for the producer load warp + auto pld_args = cutlass::epilogue::fusion::detail::ProducerLoadArgs{ + problem_shape_mnkl, + CtaTileMNK{}, + tile_coord_mnkl, + tiled_mma, + EpilogueTile{}, + thread_idx + }; + auto pld_callbacks = fusion_callbacks.get_producer_load_callbacks(pld_args); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + LoadPipelineState last_load_producer_state = load_pipe_producer_state; + + // Predication for TMA load (one thread issues TMA load) + bool issue_tma_load = cute::elect_one_sync(); + + // Pre-loop fusion callback entry point + pld_callbacks.begin(); + + LoadPipelineState prior_state = load_pipe_producer_state; + + bool did_load = false; + + CUTLASS_PRAGMA_UNROLL + for (int epi_n = 0; epi_n < size<3>(gC_epi); ++epi_n) { + CUTLASS_PRAGMA_UNROLL + for (int epi_m = 0; epi_m < size<2>(gC_epi); ++epi_m) { + if (subtile_idx != -1 && (epi_n * static_cast(size<2>(gC_epi)) + epi_m) != subtile_idx) { + continue; + } + + // Acquire the lock for this stage + constexpr uint16_t mcast_mask = 0; + uint64_t* tma_barrier = load_pipeline.producer_get_barrier(load_pipe_producer_state); + + load_pipeline.producer_acquire(load_pipe_producer_state); + + // Loop 
fusion callback entry point + pld_callbacks.step(tma_barrier, epi_m, epi_n, load_pipe_producer_state.count(), issue_tma_load); + + // Execute the TMA load for C if needed + if (is_C_load_needed) { + if (issue_tma_load) { + copy(params.tma_load_c.with(load_tensormap, *tma_barrier, mcast_mask), + bGS_gC(_,_,_,epi_m,epi_n), bGS_sC(_,_,_,load_pipe_producer_state.index())); + load_pipeline.producer_expect_transaction(load_pipe_producer_state); + } + last_load_producer_state = load_pipe_producer_state; + did_load = true; + } + + // Commit TMA loads for this stage and release the lock + load_pipeline.producer_commit(load_pipe_producer_state); + ++load_pipe_producer_state; + } + } + + // Post-loop fusion callback entry point + pld_callbacks.end(); + + if (wait_until_load_finishes && did_load) { + typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_tma_consumer_state = + {last_load_producer_state.index(), !last_load_producer_state.phase(), last_load_producer_state.count()}; + load_pipeline.consumer_wait(epi_load_pipe_tma_consumer_state); + } + + return load_pipe_producer_state; + } + + CUTLASS_DEVICE auto + load_tail( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state) { + + if (!fusion_callbacks.is_producer_load_needed()) { + return load_pipe_producer_state; + } + + bool issue_tma_load = cute::elect_one_sync(); + if (issue_tma_load) { + load_pipeline.producer_tail(load_pipe_producer_state); + } + + return load_pipe_producer_state; + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class AccEngine, class AccLayout, + class TiledMma, + class TensorMapD + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_MNK, + TileCoordMNKL tile_coord_mnkl, + cute::Tensor accumulators, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + TensorMapD const& store_tensormap, + int subtile_idx=-1) { + + using namespace cute; + using ElementAccumulator = typename AccEngine::value_type; + using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits::ElementCompute; + using ElementCompute = cute::conditional_t,ElementAccumulator,ElementCompute_>; + + static_assert(is_rmem::value, "Accumulator must be RF resident."); + static_assert(rank(AccLayout{}) == 3, "Accumulator must be MMA-partitioned: (MMA,MMA_M,MMA_N)"); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "TileShapeMNK must be static"); + static_assert(rank(TileShapeMNK{}) == 3, "TileShapeMNK must be rank 3"); + static_assert(rank(TileCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4"); + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; + + + static_assert(!is_im2col_D, "Do not support im2col"); + + auto coord_shape = append<3>(make_shape(m_coord, n_coord), Int<0>{}); + + // Represent the full output tensor, slice to get the tile this CTA is responsible for + Tensor mD_mn = params.tma_store_d.get_tma_tensor(append<3>(make_shape(M,N), Int<1>{})); // (M,N,L) + + Tensor mD = coalesce(mD_mn, take<0,2>(CtaTileMNK{})); + Tensor gD = local_tile(mD, take<0,2>(CtaTileMNK{}), coord_shape); // (CTA_M,CTA_N) + + // Apply epilogue subtiling + Tensor gD_epi = flat_divide(gD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + 
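+    // The CTA tile is traversed as EPI_M x EPI_N subtiles of shape EpilogueTile; each
+    // subtile is staged through the pipelined smem buffers constructed below.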
+ // Construct the corresponding pipelined smem tensors + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + auto ptr_sD = shared_tensors.collective.smem_D.begin(); + Tensor sC_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + Tensor sD_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_D) + + TiledCopy tiled_copy_C_atom = make_tiled_copy_C_atom(CopyAtomC{}, tiled_mma); + + // (t)hread-partition for (r)egister to (r)egister copy (tRR_) + TiledCopy tiled_r2r = [&]() { + if constexpr (IsUseR2R) { + return make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + } + else { + return make_tiled_copy_S(Copy_Atom, + ElementCompute>{}, tiled_copy_C_atom); + } + }(); + ThrCopy thread_r2r = tiled_r2r.get_slice(thread_idx); + + // (t)hread-partition for (r)egister to (s)mem copy (tRS_) + TiledCopy tiled_r2s = [&]() { + if constexpr (IsUseR2R) { + return make_tiled_copy_D(Copy_Atom{}, tiled_r2r); + } + else { + return make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + } + }(); + ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx); + Tensor tRS_rAcc = thread_r2s.retile_S(accumulators); // ((R2S,R2S_V),MMA_M,MMA_N) + Tensor tRS_sD = thread_r2s.partition_D(sD_epi); // (R2S,R2S_M,R2S_N,PIPE_D) + + auto mma_tile_m = size<0>(TileShapeMNK{}) / size<1>(tRS_rAcc); + auto mma_tile_n = size<1>(TileShapeMNK{}) / size<2>(tRS_rAcc); + auto epi_tile_m = size<0>(EpilogueTile{}); + auto epi_tile_n = size<1>(EpilogueTile{}); + + // Allocate D registers + Layout tRS_rD_layout = make_layout(take<0,3>(shape(thread_r2s.partition_S(sD_epi)))); + Tensor tRS_rD = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + + // Vectorized fragment view + constexpr int FragmentSize = DispatchPolicy::FragmentSize; + Tensor tRS_rAcc_frg = recast>(tRS_rAcc); + Tensor tRS_rD_frg = recast>(tRS_rD); + CUTE_STATIC_ASSERT(size<0>(tRS_rAcc) % FragmentSize == 0, "Fragment size does not vectorize properly"); + + // (t)hread-partition for (s)mem to (r)egister copy (tSR_) + TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx); + Tensor tSR_sC = thread_s2r.partition_S(sC_epi); // (S2R,S2R_M,S2R_N,PIPE_C) + Layout tSR_rC_layout = thread_s2r.retile_D(tRS_rD).layout(); // (S2R,S2R_M,S2R_N) + + // Allocate C registers + // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type + // to eliminate some redundant pack+unpack instruction sequences for sub-word types + constexpr bool IsDirectS2R = cute::is_same_v> + && decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1; + using RegisterElementC = cute::conditional_t; + Tensor tRS_rC = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + Tensor tSR_rC = thread_s2r.retile_D(tRS_rC); // (S2R,S2R_M,S2R_N) + + // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_) + ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{}); + Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi); // (S2G,S2G_M,S2G_N,PIPE_D) + Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi); // (S2G,S2G_M,S2G_N,EPI_M,EPI_N) + + // OOB predication for tile quantization "residue" + // Absolute coordinate tensors (dynamic) + Tensor mD_crd = make_identity_tensor(make_shape(M,N)); // (M,N) + Tensor cD_mn = local_tile(mD_crd, take<0,2>(CtaTileMNK{}), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) + Tensor tRS_cD_mn = 
thread_r2s.partition_S(flat_divide(cD_mn, EpilogueTile{})); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) + // Relative coordinate tensors (static) + Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tRS_cD = make_counting_tensor(tRS_cD_mn.layout()); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) + // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate + auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) + auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{}); // (m,n) + + CUTE_STATIC_ASSERT(epi_tile_m % mma_tile_m == 0, "MMA_TILE_M must divide EPI_TILE_M"); + + CUTE_STATIC_ASSERT(mma_tile_n % epi_tile_n == 0, "EPI_TILE_N must divide MMA_TILE_N"); + // Get TiledCopy for partition reference when consumer store. + TiledCopy tiled_copy_partition_ref = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + // Get the fusion callbacks for the consumer store warps + constexpr bool RefSrc = true; // Register tensors reference R2S copy src layout + auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs{ + problem_shape_mnkl, + CtaTileMNK{}, + tile_coord_mnkl, + tiled_mma, + EpilogueTile{}, + tiled_copy_partition_ref, + cD, + residue_cD, + tRS_cD, + residue_tRS_cD, + tRS_rC, + thread_idx + }; + auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks(cst_args); + bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed(); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + // Thread synchronizer for previously issued waits or fences + // to ensure visibility of smem reads/writes to threads or TMA unit + auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + + // Predication for TMA store (one warp issues TMA store) + bool issue_tma_store = (thread_idx / NumThreadsPerWarp) == 0; + + // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. + // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can + // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks. + // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion. 
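+    // Concretely (assuming ReuseSmemC with StagesD == 2): the shared buffer written by the
+    // first store can only be released after two stores have been committed, since only then
+    // does producer_acquire guarantee the first store has drained; this is why, under smem
+    // reuse, load_wait_state below is seeded from the store-side state.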
+ LoadPipelineState load_wait_state = load_pipe_consumer_state; + if constexpr (ReuseSmemC) { + load_wait_state = store_pipe_producer_state; + load_wait_state.phase_ ^= 1; + } + + // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions + // Sync requirements of smem reuse may preclude this optimization + // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD + int epi_m_prev = 0, epi_n_prev = 0; + static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock"); + + // The TMA store sequence for one subtile iteration + auto tma_store_fn = [&] (int epi_m, int epi_n) { + // Write the tile from smem to gmem with TMA + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + synchronize(); // ensure all threads have issued their async fence + if constexpr (is_destination_supported) { + if (issue_tma_store) { + copy(params.tma_store_d.with(store_tensormap), bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n)); + } + } + + // Post async fence, pre TMA commit callback entry point + cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store); + + // Commit the TMA stores for this stage + if (issue_tma_store) { + store_pipeline.producer_commit(store_pipe_producer_state); + } + ++store_pipe_producer_state; + ++issued_stores; + + // Wait for the next smem buffer to be available + if (issue_tma_store) { + store_pipeline.producer_acquire(store_pipe_producer_state); + } + synchronize(); + + if constexpr (ReuseSmemC) { + // producer_acquire returns when at most StagesD-1 committed stores are pending + bool store_finished = issued_stores > StorePipeline::UnacquiredStages; + // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits + if (store_finished) { + if (is_producer_load_needed) { + load_pipeline.consumer_release(load_pipe_consumer_state); + } + ++load_pipe_consumer_state; + } + } + }; + + // + // BEGIN EPILOGUE + // + + // Pre-loop fusion callback entry point + cst_callbacks.begin(); + + // For each output tile + CUTLASS_PRAGMA_UNROLL + for (int epi_n = 0; epi_n < size<3>(gD_epi); ++epi_n) { + CUTLASS_PRAGMA_UNROLL + for (int epi_m = 0; epi_m < size<2>(gD_epi); ++epi_m) { + bool is_first_iteration = epi_m == 0 && epi_n == 0; + bool is_last_iteration = epi_m == size<2>(gD_epi)-1 && epi_n == size<3>(gD_epi)-1; + + if (subtile_idx != -1 && (epi_n * static_cast(size<2>(gD_epi)) + epi_m) != subtile_idx) { + continue; + } + + cst_callbacks.begin_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Wait for the producer load to fill smem + load_pipeline.consumer_wait(load_wait_state); + + if (is_C_load_needed) { + // Copy source tile from smem to register + copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC); + } + } + + // First loop fusion callback entry point + cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed); + + if (is_producer_load_needed) { + if constexpr (not ReuseSmemC) { + // Let producer load warp know smem buffers are consumed and empty + cutlass::arch::fence_view_async_shared(); + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + ++load_wait_state; + } + + int mma_m = epi_m; + int mma_n = (epi_n * size<1>(EpilogueTile{})) / mma_tile_n; + Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n); + + // Vectorized fragment loop with visitor 
callback entry point + int epi_n_in_mma = epi_n % (mma_tile_n / epi_tile_n); + int r2s_v = epi_n_in_mma * size(tRS_rD_frg); + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(tRS_rD_frg); ++epi_v) { + tRS_rD_frg(epi_v) = cst_callbacks.visit(tRS_rAcc_frg_mn(r2s_v + epi_v), epi_v, epi_m, epi_n); + } + // The latest we can delay the TMA store is right before the smem store of the next iteration + // since the current TMA store needs to be committed before we can acquire the next smem buffer + if constexpr (DelayTmaStore) { + // Issue TMA stores for the previous subtile + if (not is_first_iteration and subtile_idx == -1) { + tma_store_fn(epi_m_prev, epi_n_prev); + } + epi_m_prev = epi_m; + epi_n_prev = epi_n; + } + + // Smem reduction callback entry point using current store buffer for workspace + cst_callbacks.reduce(sD_epi(_,_,store_pipe_producer_state.index()), + synchronize, epi_m, epi_n, is_last_iteration, tRS_rD_frg); + + // Copy tile from register to regiser if needed + if constexpr (IsUseR2R) { + // retile source and destination for tiled_r2r + Tensor tRR_rD_src = thread_r2r.retile_S(tRS_rD); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + Tensor tRR_rD_dst = thread_r2r.retile_D(tRS_rD); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + + // Output needs register shuffling before copying to shared memory. + copy(tiled_r2r, tRR_rD_src, tRR_rD_dst); + } + + // Copy tile from register to smem + if constexpr (is_destination_supported) { + copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index())); + } + + // Post reduction, pre TMA store callback entry point + constexpr bool issue_smem_store = true; // No smem store predication + cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store); + + if constexpr (not DelayTmaStore) { + // Issue TMA stores for this subtile + tma_store_fn(epi_m, epi_n); + } + + cst_callbacks.end_loop(epi_m, epi_n); + + } // for epi_m + } // for epi_n + + + if constexpr (DelayTmaStore) { + // Issue TMA stores for the last subtile + tma_store_fn(epi_m_prev, epi_n_prev); + } + + // Post-loop fusion callback entry point + cst_callbacks.end(); + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + CUTLASS_DEVICE auto + store_tail( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state) { + // wait for all TMA stores to complete + store_pipeline.producer_tail(store_pipe_producer_state); + // reset store counter + issued_stores = 0; + + if constexpr (ReuseSmemC) { + if (fusion_callbacks.is_producer_load_needed()) { + // Issue releases on up to StagesD-1 previously issued TMA stores + constexpr int release_stages = cute::min(StorePipeline::UnacquiredStages, get_load_pipe_increment(CtaTileMNK{})); + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < release_stages; ++stage) { + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + } + } + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + CUTLASS_DEVICE auto + store_init( + Params const& params, + TensorMapStorage& shared_tensormaps, + int32_t sm_count, + int32_t sm_idx, + int32_t warp_group_idx) { + int warp_idx_in_warp_group = canonical_warp_idx_sync() % NumWarpsPerWarpGroup; + // Since only one warp issues TMA store, we only need that one warp to initialize tensormaps + if (warp_idx_in_warp_group == 0) { + // Initialize tma + constexpr bool IsLoad = false; + auto store_tensormaps = 
tensormaps_init(params, shared_tensormaps, sm_count, sm_idx, warp_group_idx); + return store_tensormaps; + } + TmaDescriptor* null_tma_desc = nullptr; + return cute::make_tuple(null_tma_desc); + } + + // + // Methods to perform different parts of TMA/Tensormap modifications + // + + template + CUTLASS_DEVICE auto + tensormaps_init( + Params const& params, + TensorMapStorage& shared_tensormaps, + int32_t sm_count, + int32_t sm_idx, + int32_t warp_group_idx) { + + constexpr uint32_t NumInputTensors = NumEpilogueWarpGroups + (cute::is_void_v ? 0 : 1); + Layout desc_layout = make_layout(make_shape(sm_count, Int{})); + + Tensor gmem_tensormap = make_tensor(params.tensormaps, desc_layout); // (SMs, NumInputTensors) + + if constexpr (IsLoad) { + if (is_source_supported) { + constexpr int C_tensormap_index = NumEpilogueWarpGroups; + Tensor pC_tensormap = make_tensor(params.tma_load_c.get_tma_descriptor(), Int<1>{}, Int<1>{}); + Tensor sC_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_C), Int<1>{}, Int<1>{}); + + if (cute::elect_one_sync()) { + // Bringing tensormaps from params to smem for modification later + copy(recast(pC_tensormap), recast(sC_tensormap)); + } + __syncwarp(); + return cute::make_tuple(&gmem_tensormap(sm_idx, C_tensormap_index)); + + } + TmaDescriptor* null_tma_desc = nullptr; + return cute::make_tuple(null_tma_desc); + } + else { + Tensor pD_tensormap = make_tensor(params.tma_store_d.get_tma_descriptor(), Int<1>{}, Int<1>{}); + Tensor sD_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_D[warp_group_idx]), Int<1>{}, Int<1>{}); + + if (cute::elect_one_sync()) { + // Bringing tensormaps from params to smem for modification later + copy(recast(pD_tensormap), recast(sD_tensormap)); + } + __syncwarp(); + return cute::make_tuple(&gmem_tensormap(sm_idx, warp_group_idx)); + } + } + + // Replace address for the global tensor (to be done by single thread) + template + CUTLASS_DEVICE + void + tensormaps_replace_global_address( + TensorMapStorage& shared_tensormaps, + Params const& params, + int32_t next_batch, + int32_t warp_group_idx) { + // Replacing global_address for the next batch + if constexpr (IsLoad) { + if constexpr (is_source_supported) { + if (params.ptr_C != nullptr) { + cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_C, + params.ptr_C[next_batch]); + } + } + } + else if constexpr (is_destination_supported) { + cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_D[warp_group_idx], + params.ptr_D[next_batch]); + } + } + + // Replace dim and strides for the global tensor - used only for Grouped GEMM (to be done by single thread) + template + CUTLASS_DEVICE + void + tensormaps_replace_global_tensor_properties( + TensorMapStorage& shared_tensormaps, + Params const& params, + int32_t next_group, + ProblemShape_MNKL problem_shape_mnkl, + int32_t warp_group_idx) { + const uint32_t M = get<0>(problem_shape_mnkl); + const uint32_t N = get<1>(problem_shape_mnkl); + // Replace all dims for consistency + constexpr int MaxTensorRank = 5; + cute::array prob_shape = {1,1,1,1,1}; + cute::array prob_stride = {0,0,0,0,0}; + + if constexpr (IsLoad) { + if constexpr (is_source_supported) { + if (params.dC != nullptr) { + ElementC const* ptr_C = nullptr; + Tensor tensor_c = make_tensor(ptr_C, make_layout(make_shape(M,N,Int<1>{}), params.dC[next_group])); + + cute::detail::fill_tma_gmem_shape_stride(params.tma_load_c, tensor_c, + prob_shape, prob_stride); + // Convert strides to byte 
strides + for (uint64_t& stride : prob_stride) { + stride = (stride * sizeof_bits_v) / 8; + } + cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_C, + prob_shape, + prob_stride); + } + } + } + else if constexpr (is_destination_supported) { + ElementD const* ptr_D = nullptr; + Tensor tensor_d = make_tensor(ptr_D, make_layout(make_shape(M,N,Int<1>{}), params.dD[next_group])); + + cute::detail::fill_tma_gmem_shape_stride(params.tma_store_d, tensor_d, + prob_shape, prob_stride); + // Convert strides to byte strides + for (uint64_t& stride : prob_stride) { + stride = (stride * sizeof_bits_v) / 8; + } + + cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_D[warp_group_idx], + prob_shape, + prob_stride); + } + } + + template + CUTLASS_DEVICE + void + tensormaps_perform_update( + TensorMapStorage& shared_tensormaps, + Params const& params, + cute::TmaDescriptor const* tensormap, + ProblemShape_MNKL problem_shape_mnkl, + int32_t next_batch, + int32_t warp_group_idx) { + + if (cute::elect_one_sync()) { + // Replacing global_address for the next batch + tensormaps_replace_global_address(shared_tensormaps, params, next_batch, warp_group_idx); + + if constexpr (IsGroupedGemmKernel) { + // Replacing global dims and strides for the next batch + tensormaps_replace_global_tensor_properties( + shared_tensormaps, params, next_batch, problem_shape_mnkl, warp_group_idx); + } + + } + } + + template + CUTLASS_DEVICE + void + tensormaps_cp_fence_release( + TensorMapStorage& shared_tensormaps, + cute::TmaDescriptor const* tensormap, + const int32_t warp_group_idx = 0) { + + // Entire warp must do this (ie its aligned) + if constexpr (IsLoad) { + if constexpr (is_source_supported) { + tma_descriptor_cp_fence_release(tensormap, shared_tensormaps.smem_tensormap_C); + } + } + else if constexpr (is_destination_supported) { + tma_descriptor_cp_fence_release(tensormap, shared_tensormaps.smem_tensormap_D[warp_group_idx]); + } + } + + template + CUTLASS_DEVICE + void + tensormaps_fence_acquire(cute::TmaDescriptor const* tensormap) { + if constexpr (IsLoad) { + if constexpr (is_source_supported) { + cute::tma_descriptor_fence_acquire(tensormap); + } + } + else { + cute::tma_descriptor_fence_acquire(tensormap); + } + } + +private: + Params const& params; + FusionCallbacks fusion_callbacks; + int issued_stores = 0; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp new file mode 100644 index 0000000000..b3c7bf387d --- /dev/null +++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp @@ -0,0 +1,918 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp" +#include "cutlass/detail/collective.hpp" +#include "cutlass/detail/layout.hpp" +#include "cutlass/trace.h" + +#include "cute/tensor.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_, + class CtaTileMNK_, // (CTA_M,CTA_N,CTA_K) + class EpilogueTile_, // (EPI_TILE_M,EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class FusionCallbacks_, + class CopyOpG2S_, + class SmemLayoutAtomC_, + class CopyOpS2R_, + class CopyOpS2G_, + class SmemLayoutAtomD_, + class CopyOpR2S_, + class CopyAtomC_, + class CopyOpR2R_ +> +class CollectiveEpilogue< + Sm90TmaWarpSpecialized, + CtaTileMNK_, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpG2S_, + SmemLayoutAtomC_, + CopyOpS2R_, + CopyOpS2G_, + SmemLayoutAtomD_, + CopyOpR2S_, + CopyAtomC_, + CopyOpR2R_ +> { +public: + // + // Type Aliases + // + using DispatchPolicy = Sm90TmaWarpSpecialized; + using CtaTileMNK = CtaTileMNK_; + using EpilogueTile = EpilogueTile_; + using FusionCallbacks = FusionCallbacks_; + using ElementC = ElementC_; + using StrideC = StrideC_; + using ElementD = ElementD_; + using StrideD = StrideD_; + using CopyOpG2S = CopyOpG2S_; + using SmemLayoutAtomC = SmemLayoutAtomC_; + using CopyOpS2R = CopyOpS2R_; + using CopyOpS2G = CopyOpS2G_; + using SmemLayoutAtomD = SmemLayoutAtomD_; + using CopyOpR2S = CopyOpR2S_; + using CopyAtomC = CopyAtomC_; + using 
CopyOpR2R = CopyOpR2R_; + + using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits::Operation; + using GmemTiledCopyC = CopyOpG2S; + using GmemTiledCopyD = CopyOpS2G; + + static_assert(!is_layout::value && is_tuple::value, "EpilogueTile must be a cute::Tile or cute::Shape"); + static_assert(cute::rank(CtaTileMNK{}) == 3, "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]"); + static_assert(cute::rank(EpilogueTile{}) == 2, "EpilogueTile must be rank-2: [EPI_TILE_M, EPI_TILE_N]"); + static_assert(size<0>(CtaTileMNK{}) % size<0>(shape(EpilogueTile{})) == 0, "EPI_TILE_M must divide CTA_M"); + static_assert(size<1>(CtaTileMNK{}) % size<1>(shape(EpilogueTile{})) == 0, "EPI_TILE_N must divide CTA_N"); + static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]"); + static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]"); + +private: + constexpr static bool is_source_supported = not cute::is_void_v; + constexpr static bool is_destination_supported = not cute::is_void_v; + using NonVoidElementD = cute::conditional_t, ElementD>; + static_assert(not cute::is_void_v, "SmemElementD is void"); + using NonVoidElementC = cute::conditional_t; // prevents void ref breakages + + using TmaElementD = cute::conditional_t>, uint64_t, NonVoidElementD>; + using TmaElementC = cute::conditional_t>, uint64_t, NonVoidElementC>; + + using SmemElementC = typename cutlass::detail::get_unpacked_element_type::type; + using SmemElementD = typename cutlass::detail::get_unpacked_element_type::type; + + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static bool ReuseSmemC = ReuseSmemC_ and is_destination_supported; + constexpr static bool DelayTmaStore = DelayTmaStore_; + + constexpr static bool is_m_major_C = detail::is_m_major(); + constexpr static bool is_m_major_D = detail::is_m_major(); + + constexpr static bool is_im2col_C = cute::is_same_v; + constexpr static bool is_im2col_D = cute::is_same_v; + + // Check if register transformation is needed before copying register to shared memory. 
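+  // When CopyOpR2R is non-void, accumulator fragments are shuffled register-to-register before
+  // the register-to-smem store so their layout matches the smem store pattern (the same
+  // tiled_r2r path used by the pointer-array epilogue in this change); when it is void,
+  // fragments are copied to smem directly.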
+ constexpr static bool IsUseR2R = !cute::is_void_v; + + using SmemLayoutC = decltype(tile_to_shape( + SmemLayoutAtomC{}, + make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + using SmemLayoutD = decltype(tile_to_shape( + SmemLayoutAtomD{}, + make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + + constexpr static bool support_smem_reuse = is_source_supported && is_destination_supported && StagesD <= StagesC + && cosize(take<0,2>(SmemLayoutC{})) == cosize(take<0,2>(SmemLayoutD{})); + static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met"); + + constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{}); + constexpr static size_t SmemAlignmentC = cutlass::detail::alignment_for_swizzle(SmemLayoutC{}); + constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD); + + using SmemArrayTypeC = cute::ArrayEngine>; + using SmemArrayTypeD = cute::ArrayEngine>; + + using EmptyType = cute::tuple<>; + using SmemCStorage = cute::conditional_t; + using SmemDStorage = cute::conditional_t; + + struct CollectiveStorageWithC { + alignas(SmemAlignmentC) ArrayEngine> smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageWithoutC { + cute::array smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageReuseC { + alignas(MaxSmemAlignment) ArrayEngine> smem_C; + alignas(MaxSmemAlignment) ArrayEngine> smem_D; + }; + +public: + // TMA pipeline for loading C + using LoadPipeline = cutlass::PipelineTransactionAsync; + using LoadPipelineState = cutlass::PipelineState; + constexpr static uint32_t TmaTransactionBytes = + (size(take<0,2>(SmemLayoutC{})) * static_cast(sizeof_bits::value)) / 8; + constexpr static bool RequiresTransactionBytes = true; + + // TMA pipeline for storing D + using StorePipeline = cute::conditional_t, + cutlass::PipelineTmaStore>; + using StorePipelineState = cutlass::PipelineState; + + struct SharedStorage { + struct TensorStorage { + using CollectiveStorage = cute::conditional_t>; + CollectiveStorage collective; + + using FusionStorage = typename FusionCallbacks::SharedStorage; + FusionStorage thread; + } tensors; + + using PipelineStorage = typename LoadPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + // Host side epilogue arguments + struct Arguments { + typename FusionCallbacks::Arguments thread{}; + ElementC const* ptr_C; + StrideC dC; + ElementD const* ptr_D; + StrideD dD; + }; + + // Device side epilogue params + struct Params { + using TMA_C = decltype(make_tma_copy( + CopyOpG2S{}, + make_tensor(make_gmem_ptr(nullptr), + repeat_like(StrideC{}, int32_t(0)), StrideC{}), + take<0,2>(SmemLayoutC{}), + EpilogueTile{}, + _1{})); + using TMA_D = decltype(make_tma_copy( + CopyOpS2G{}, + make_tensor(make_gmem_ptr(nullptr), + repeat_like(StrideD{}, int32_t(0)), StrideD{}), + take<0,2>(SmemLayoutD{}), + EpilogueTile{}, + _1{})); + + typename FusionCallbacks::Params thread{}; + TMA_C tma_load_c; + TMA_D tma_store_d; + uint32_t tma_transaction_bytes = TmaTransactionBytes; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + ProblemShape const& problem_shape, + Arguments const& args, + [[maybe_unused]] void* workspace) { + // 
Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + uint32_t transaction_bytes = TmaTransactionBytes; + typename Params::TMA_C tma_load_c{}; + if constexpr (is_source_supported) { + Tensor tensor_c = make_tensor(make_gmem_ptr(args.ptr_C), make_layout(make_shape(M,N,L), args.dC)); + tma_load_c = make_tma_copy_C_sm90( + CopyOpG2S{}, + tensor_c, + take<0,2>(SmemLayoutC{}), + EpilogueTile{}); + } + + typename Params::TMA_D tma_store_d; + if constexpr (is_destination_supported) { + Tensor tensor_d = make_tensor(make_gmem_ptr(args.ptr_D), make_layout(make_shape(M,N,L), args.dD)); + tma_store_d = make_tma_copy_C_sm90( + CopyOpS2G{}, + tensor_d, + take<0,2>(SmemLayoutD{}), + EpilogueTile{}); + } + + return { + FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, workspace), + tma_load_c, + tma_store_d, + transaction_bytes + }; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return FusionCallbacks::get_workspace_size(problem_shape, args.thread); + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter); + } + + template + static bool + can_implement( + ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + auto shape = cute::make_shape(M,N,L); + + bool implementable = true; + if constexpr (is_destination_supported) { + constexpr int tma_alignment_bits_D = cutlass::detail::get_output_alignment_bits(); + constexpr int min_tma_aligned_elements_D = tma_alignment_bits_D / cutlass::sizeof_bits::value; + if constexpr (cute::is_same_v) { // ignore L stride for implicit gemm + implementable = cutlass::detail::check_alignment(take<0,2>(shape), take<0,2>(StrideD{})); + } + else { + implementable = cutlass::detail::check_alignment(shape, StrideD{}); + } + } + + if constexpr (not cute::is_void_v) { + constexpr int tma_alignment_bits_C = cutlass::detail::get_input_alignment_bits(); + constexpr int min_tma_aligned_elements_C = tma_alignment_bits_C / cutlass::sizeof_bits::value; + if constexpr (cute::is_same_v) { // ignore L stride for implicit gemm + implementable = implementable && cutlass::detail::check_alignment(take<0,2>(shape), take<0,2>(StrideC{})); + } + else { + implementable = implementable && cutlass::detail::check_alignment(shape, StrideC{}); + } + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + } + + bool fusion_implementable = FusionCallbacks::can_implement(problem_shape, args.thread); + + if (!fusion_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n"); + } + + bool beta_implementable = true; + + if constexpr (cute::is_void_v) { + if constexpr (detail::has_beta::value) { + beta_implementable = args.thread.beta == 0.0; + } + if constexpr (detail::has_beta_ptr::value) { + beta_implementable = beta_implementable && args.thread.beta_ptr == nullptr; + } + } + + if (!beta_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Beta/beta pointer was 
set, but epilogue is sourceless (void-C).\n"); + } + + return implementable && fusion_implementable && beta_implementable; + } + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_load_pipe_increment(TileShapeMNK tile_shape_MNK) { + // Compute number of epilogue subtiles + return size<1>(zipped_divide(make_layout(take<0,2>(tile_shape_MNK)), EpilogueTile{})); + } + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_store_pipe_increment(TileShapeMNK tile_shape_MNK) { + return get_load_pipe_increment(tile_shape_MNK); + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void + prefetch_tma_descriptors(Params const& epilogue_params) { + if constexpr (is_source_supported) { + cute::prefetch_tma_descriptor(epilogue_params.tma_load_c.get_tma_descriptor()); + } + if constexpr (is_destination_supported) { + cute::prefetch_tma_descriptor(epilogue_params.tma_store_d.get_tma_descriptor()); + } + } + + CUTLASS_HOST_DEVICE + CollectiveEpilogue(Params const& params_, TensorStorage& shared_tensors) + : params(params_), fusion_callbacks(params_.thread, shared_tensors.thread) {} + + CUTLASS_DEVICE + bool + is_producer_load_needed() const { + return fusion_callbacks.is_producer_load_needed(); + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class TiledMma + > + CUTLASS_DEVICE auto + load( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_MNK, + TileCoordMNKL tile_coord_mnkl, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + int subtile_idx=-1) { + using namespace cute; + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; + + // The tma tensor C under im2col mode only has two modes (M, N) which + // should be local tiled with only (m_coord, n_coord). 
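// That is, for an im2col-strided C (implicit-GEMM convolution) the L mode is already folded into the
// TMA im2col descriptor, so the CTA tile coordinate drops l_coord; for an ordinary GEMM the full
// (m, n, l) coordinate is used. conditional_return below selects between the two at compile time,
// roughly:
//
//   coord_shape = is_im2col_C ? (m_coord, n_coord) : (m_coord, n_coord, l_coord)   // illustrative only
//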
+ auto coord_shape = conditional_return( + make_coord(m_coord, n_coord), + make_coord(m_coord, n_coord, l_coord)); + + // Represent the full source tensor, slice to get the tile this CTA is currently responsible for + Tensor mC_mn = params.tma_load_c.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor mC = coalesce(mC_mn, take<0,2>(CtaTileMNK{})); + Tensor gC = local_tile(mC, take<0,2>(CtaTileMNK{}), coord_shape); // (CTA_M,CTA_N) + + // Apply epilogue subtile, get matching smem tensor + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + Tensor gC_epi = flat_divide(gC, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor sC_epi = make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{}); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + + // Prepare the thread(b)lock's (G)mem to (S)mem TMA tiled copy (bGS_) + ThrCopy thrblk_g2s = params.tma_load_c.get_slice(Int<0>{}); + Tensor bGS_gC = thrblk_g2s.partition_S(gC_epi); // (G2S,G2S_M,G2S_N,EPI_M,EPI_N) + Tensor bGS_sC = thrblk_g2s.partition_D(sC_epi); // (G2S,G2S_M,G2S_N,PIPE_C) + + // Get the fusion callbacks for the producer load warp + auto pld_args = cutlass::epilogue::fusion::detail::ProducerLoadArgs( + problem_shape_mnkl, + CtaTileMNK{}, + tile_coord_mnkl, + tiled_mma, + EpilogueTile{}, + thread_idx + ); + auto pld_callbacks = fusion_callbacks.get_producer_load_callbacks(pld_args); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + // Predication for TMA load (one thread issues TMA load) + bool issue_tma_load = cute::elect_one_sync(); + + // Pre-loop fusion callback entry point + pld_callbacks.begin(); + + CUTLASS_PRAGMA_UNROLL + for (int epi_n = 0; epi_n < size<3>(gC_epi); ++epi_n) { + CUTLASS_PRAGMA_UNROLL + for (int epi_m = 0; epi_m < size<2>(gC_epi); ++epi_m) { + if (subtile_idx != -1 && (epi_n * static_cast(size<2>(gC_epi)) + epi_m) != subtile_idx) { + continue; + } + // Acquire the lock for this stage + constexpr uint16_t mcast_mask = 0; + uint64_t* tma_barrier = load_pipeline.producer_get_barrier(load_pipe_producer_state); + load_pipeline.producer_acquire(load_pipe_producer_state); + + // Loop fusion callback entry point + pld_callbacks.step(tma_barrier, epi_m, epi_n, load_pipe_producer_state.count(), issue_tma_load); + + // Execute the TMA load for C if needed + if (issue_tma_load && is_C_load_needed) { + copy(params.tma_load_c.with(*tma_barrier, mcast_mask), + bGS_gC(_,_,_,epi_m,epi_n), bGS_sC(_,_,_,load_pipe_producer_state.index())); + load_pipeline.producer_expect_transaction(load_pipe_producer_state); + } + + // Commit TMA loads for this stage and release the lock + load_pipeline.producer_commit(load_pipe_producer_state); + ++load_pipe_producer_state; + } + } + + // Post-loop fusion callback entry point + pld_callbacks.end(); + + return load_pipe_producer_state; + } + + CUTLASS_DEVICE auto + load_tail( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state) { + bool issue_tma_load = cute::elect_one_sync(); + if (issue_tma_load) { + load_pipeline.producer_tail(load_pipe_producer_state); + } + + return load_pipe_producer_state; + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class AccEngine, class AccLayout, + class TiledMma + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_MNK, + TileCoordMNKL tile_coord_mnkl, + 
cute::Tensor accumulators, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + int subtile_idx=-1) { + using namespace cute; + using ElementAccumulator = typename AccEngine::value_type; + using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits::ElementCompute; + using ElementCompute = cute::conditional_t,ElementAccumulator,ElementCompute_>; + + static_assert(is_rmem::value, "Accumulator must be RF resident."); + static_assert(rank(AccLayout{}) == 3, "Accumulator must be MMA-partitioned: (MMA,MMA_M,MMA_N)"); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "TileShapeMNK must be static"); + static_assert(rank(TileShapeMNK{}) == 3, "TileShapeMNK must be rank 3"); + static_assert(rank(TileCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4"); + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; + + // The tma tensor D under im2col mode only has two modes (M, N) which + // should be local tiled with only (m_coord, n_coord). + auto coord_shape = conditional_return( + make_coord(m_coord, n_coord), + make_coord(m_coord, n_coord, l_coord)); + + // Represent the full output tensor, slice to get the tile this CTA is responsible for + Tensor mD_mn = params.tma_store_d.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor mD = coalesce(mD_mn, take<0,2>(CtaTileMNK{})); + Tensor gD = local_tile(mD, take<0,2>(CtaTileMNK{}), coord_shape); // (CTA_M,CTA_N) + + // Apply epilogue subtiling + Tensor gD_epi = flat_divide(gD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + // Construct the corresponding pipelined smem tensors + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + auto ptr_sD = shared_tensors.collective.smem_D.begin(); + Tensor sC_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + Tensor sD_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_D) + + TiledCopy tiled_copy_C_atom = make_tiled_copy_C_atom(CopyAtomC{}, tiled_mma); + + // (t)hread-partition for (r)egister to (r)egister copy (tRR_) + TiledCopy tiled_r2r = [&]() { + if constexpr (IsUseR2R) { + return make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + } + else { + return make_tiled_copy_S(Copy_Atom, + ElementCompute>{}, tiled_copy_C_atom); + } + }(); + ThrCopy thread_r2r = tiled_r2r.get_slice(thread_idx); + + // (t)hread-partition for (r)egister to (s)mem copy (tRS_) + TiledCopy tiled_r2s = [&]() { + if constexpr (IsUseR2R) { + return make_tiled_copy_D(Copy_Atom{}, tiled_r2r); + } + else { + return make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + } + }(); + ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx); + Tensor tRS_rAcc = thread_r2s.retile_S(accumulators); // ((R2S,R2S_V),MMA_M,MMA_N) + Tensor tRS_sD = thread_r2s.partition_D(sD_epi); // (R2S,R2S_M,R2S_N,PIPE_D) + + auto mma_tile_m = size<0>(TileShapeMNK{}) / size<1>(tRS_rAcc); + auto mma_tile_n = size<1>(TileShapeMNK{}) / size<2>(tRS_rAcc); + auto epi_tile_m = size<0>(EpilogueTile{}); + auto epi_tile_n = size<1>(EpilogueTile{}); + + // Allocate D registers + Layout tRS_rD_layout = make_layout(take<0,3>(shape(thread_r2s.partition_S(sD_epi)))); + Tensor tRS_rD = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + + // Vectorized fragment view + constexpr int FragmentSize = DispatchPolicy::FragmentSize; + 
Tensor tRS_rAcc_frg = recast>(tRS_rAcc); + Tensor tRS_rD_frg = recast>(tRS_rD); + CUTE_STATIC_ASSERT(size<0>(tRS_rAcc) % FragmentSize == 0, "Fragment size does not vectorize properly"); + + // (t)hread-partition for (s)mem to (r)egister copy (tSR_) + TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx); + Tensor tSR_sC = thread_s2r.partition_S(sC_epi); // (S2R,S2R_M,S2R_N,PIPE_C) + Layout tSR_rC_layout = thread_s2r.retile_D(tRS_rD).layout(); // (S2R,S2R_M,S2R_N) + + // Allocate C registers + // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type + // to eliminate some redundant pack+unpack instruction sequences for sub-word types + constexpr bool IsDirectS2R = cute::is_same_v> + && decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1; + using RegisterElementC = cute::conditional_t; + Tensor tRS_rC = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + Tensor tSR_rC = thread_s2r.retile_D(tRS_rC); // (S2R,S2R_M,S2R_N) + + // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_) + ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{}); + Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi); // (S2G,S2G_M,S2G_N,PIPE_D) + Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi); // (S2G,S2G_M,S2G_N,EPI_M,EPI_N) + + // OOB predication for tile quantization "residue" + // Absolute coordinate tensors (dynamic) + Tensor mD_crd = make_identity_tensor(make_shape(M,N)); // (M,N) + Tensor cD_mn = local_tile(mD_crd, take<0,2>(CtaTileMNK{}), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) + Tensor tRS_cD_mn = [&]() { + if constexpr (IsUseR2R) { + // (t)hread-partition for ConsumerStoreCallbacks. + TiledCopy tiled_cst = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + ThrCopy thread_cst = tiled_cst.get_slice(thread_idx); + + return thread_cst.partition_S(flat_divide(cD_mn, EpilogueTile{})); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) + } + else { + return thread_r2s.partition_S(flat_divide(cD_mn, EpilogueTile{})); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) + } + }(); + // Relative coordinate tensors (static) + Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tRS_cD = make_counting_tensor(tRS_cD_mn.layout()); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) + // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate + auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) + auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{}); // (m,n) + + CUTE_STATIC_ASSERT(epi_tile_m % mma_tile_m == 0, "MMA_TILE_M must divide EPI_TILE_M"); + + CUTE_STATIC_ASSERT(mma_tile_n % epi_tile_n == 0, "EPI_TILE_N must divide MMA_TILE_N"); + // Get TiledCopy for partition reference when consumer store. 
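// This reference copy is handed to the fusion callbacks via ConsumerStoreArgs below, so any auxiliary
// tensors the fusion reads or writes (per-row/per-column bias, aux outputs, etc.) are partitioned with
// the same per-thread layout as the register-to-smem path, keeping thread-local coordinates consistent
// across the visitor callbacks.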
+ TiledCopy tiled_copy_partition_ref = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + // Get the fusion callbacks for the consumer store warps + constexpr bool RefSrc = true; // Register tensors reference tiled copy src layout + auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs( + problem_shape_mnkl, + CtaTileMNK{}, + tile_coord_mnkl, + tiled_mma, + EpilogueTile{}, + tiled_copy_partition_ref, + cD, + residue_cD, + tRS_cD, + residue_tRS_cD, + tRS_rC, + thread_idx + ); + auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks(cst_args); + bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed(); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + using FragmentVisit = decltype(cst_callbacks.visit(tRS_rAcc_frg(0), 0, 0, 0)); + constexpr bool IsDirectR2S = cute::is_same_v>; + using RegisterElementD = cute::conditional_t; + Tensor tRS_rCompute = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + Tensor tRS_rCompute_frg = recast>(tRS_rCompute); + + // Thread synchronizer for previously issued waits or fences + // to ensure visibility of smem reads/writes to threads or TMA unit + auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + + // Predication for TMA store (one warp issues TMA store) + bool issue_tma_store = (thread_idx / NumThreadsPerWarp) == 0; + + // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. + // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can + // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks. + // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion. 
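// As a concrete illustration (stage counts chosen only for the example): with StagesC = StagesD = 4,
// the producer_acquire issued before writing buffer k returns once at most three earlier stores are
// still in flight, which proves the store that used buffer k-4 has drained; only then may that stage's
// C-load lock be released. Releases therefore trail acquires by StagesD iterations, and the wait state
// below tracks store_pipe_producer_state (with flipped phase) rather than the plain consumer state.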
+ LoadPipelineState load_wait_state = load_pipe_consumer_state; + if constexpr (ReuseSmemC) { + load_wait_state = store_pipe_producer_state; + load_wait_state.phase_ ^= 1; + } + + // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions + // Sync requirements of smem reuse may preclude this optimization + // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD + [[maybe_unused]] int epi_m_prev = 0; + [[maybe_unused]] int epi_n_prev = 0; + static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock"); + + // The TMA store sequence for one subtile iteration + auto tma_store_fn = [&] (int epi_m, int epi_n) { + // Write the tile from smem to gmem with TMA + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + synchronize(); // ensure all threads have issued their async fence + if constexpr (is_destination_supported) { + if (issue_tma_store) { + copy(params.tma_store_d, bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n)); + } + } + + // Post async fence, pre TMA commit callback entry point + cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store); + + // Commit the TMA stores for this stage + if (issue_tma_store) { + store_pipeline.producer_commit(store_pipe_producer_state); + } + ++store_pipe_producer_state; + ++issued_stores; + + // Wait for the next smem buffer to be available + if (issue_tma_store) { + store_pipeline.producer_acquire(store_pipe_producer_state); + } + synchronize(); + + if constexpr (ReuseSmemC) { + // producer_acquire returns when at most StagesD-1 committed stores are pending + bool store_finished = issued_stores > StorePipeline::UnacquiredStages; + // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits + if (store_finished) { + if (is_producer_load_needed) { + load_pipeline.consumer_release(load_pipe_consumer_state); + } + ++load_pipe_consumer_state; + } + } + }; + + // + // BEGIN EPILOGUE + // + + // Pre-loop fusion callback entry point + cst_callbacks.begin(); + + // For each output tile + CUTLASS_PRAGMA_UNROLL + for (int epi_n = 0; epi_n < size<3>(gD_epi); ++epi_n) { + CUTLASS_PRAGMA_UNROLL + for (int epi_m = 0; epi_m < size<2>(gD_epi); ++epi_m) { + [[maybe_unused]] bool is_first_iteration = epi_m == 0 && epi_n == 0; + bool is_last_iteration = epi_m == size<2>(gD_epi)-1 && epi_n == size<3>(gD_epi)-1; + + if (subtile_idx != -1 && (epi_n * static_cast(size<2>(gD_epi)) + epi_m) != subtile_idx) { + continue; + } + + cst_callbacks.begin_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Wait for the producer load to fill smem + load_pipeline.consumer_wait(load_wait_state); + + if (is_C_load_needed) { + // Copy source tile from smem to register + copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC); + } + } + + // First loop fusion callback entry point + cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed); + + if (is_producer_load_needed) { + if constexpr (not ReuseSmemC) { + // Let producer load warp know smem buffers are consumed and empty + cutlass::arch::fence_view_async_shared(); + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + ++load_wait_state; + } + + int mma_m = epi_m; + int mma_n = (epi_n * size<1>(EpilogueTile{})) / mma_tile_n; + Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n); + + // 
Vectorized fragment loop with visitor callback entry point + int epi_n_in_mma = epi_n % (mma_tile_n / epi_tile_n); + int r2s_v = epi_n_in_mma * size(tRS_rCompute_frg); + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(tRS_rCompute_frg); ++epi_v) { + tRS_rCompute_frg(epi_v) = cst_callbacks.visit(tRS_rAcc_frg_mn(r2s_v + epi_v), epi_v, epi_m, epi_n); + } + // The latest we can delay the TMA store is right before the smem store of the next iteration + // since the current TMA store needs to be committed before we can acquire the next smem buffer + if constexpr (DelayTmaStore) { + // Issue TMA stores for the previous subtile + if (not is_first_iteration and subtile_idx == -1) { + tma_store_fn(epi_m_prev, epi_n_prev); + } + epi_m_prev = epi_m; + epi_n_prev = epi_n; + } + + // Smem reduction callback entry point using current store buffer for workspace + cst_callbacks.reduce(sD_epi(_,_,store_pipe_producer_state.index()), + synchronize, epi_m, epi_n, is_last_iteration, tRS_rCompute_frg); + + // Copy tile from register to regiser if needed + if constexpr (IsUseR2R) { + // retile source and destination for tiled_r2r + Tensor tRR_rD_src = thread_r2r.retile_S(tRS_rCompute); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + Tensor tRR_rD_dst = thread_r2r.retile_D(tRS_rCompute); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + + // Output register transformation before copying to shared memory. + copy(tiled_r2r, tRR_rD_src, tRR_rD_dst); + } + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tRS_rD_frg); ++i) { + tRS_rD_frg(i) = cutlass::NumericArrayConverter{}(tRS_rCompute_frg(i)); + } + + // Copy tile from register to smem + if constexpr (is_destination_supported) { + copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index())); + } + + // Post reduction, pre TMA store callback entry point + constexpr bool issue_smem_store = true; // No smem store predication + cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store); + + if constexpr (not DelayTmaStore) { + // Issue TMA stores for this subtile + tma_store_fn(epi_m, epi_n); + } + + cst_callbacks.end_loop(epi_m, epi_n); + + } // for epi_m + } // for epi_n + + if constexpr (DelayTmaStore) { + // Issue TMA stores for the last subtile + tma_store_fn(epi_m_prev, epi_n_prev); + } + + // Post-loop fusion callback entry point + cst_callbacks.end(); + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + CUTLASS_DEVICE auto + store_tail( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state) { + // wait for all TMA stores to complete + store_pipeline.producer_tail(store_pipe_producer_state); + // reset store counter + issued_stores = 0; + + if constexpr (ReuseSmemC) { + if (fusion_callbacks.is_producer_load_needed()) { + // Issue releases on up to StagesD-1 previously issued TMA stores + constexpr int release_stages = cute::min(StorePipeline::UnacquiredStages, get_load_pipe_increment(CtaTileMNK{})); + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < release_stages; ++stage) { + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + } + } + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + +private: + Params const& params; + FusionCallbacks fusion_callbacks; + int issued_stores = 0; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // 
namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp new file mode 100644 index 0000000000..9749040081 --- /dev/null +++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp @@ -0,0 +1,164 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing pipelined epilogues with bias add and elementwise activation functions. + This collective is now DEPRECATED, will be removed in the next release. Use EVT instead. 
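    (See the TmaWarpSpecialized schedule combined with fusion::LinCombPerRowBiasEltActAux for the
    EVT-based equivalent of this bias + elementwise epilogue.)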
+*/ + +#pragma once + +#include "sm90_epilogue_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int StagesC_, + int StagesD_, + int FragmentSize_, + class BlockTileShape_, // (BLK_M,BLK_N,BLK_K) + class EpilogueTileShape_, // (EPI_TILE_M,EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class FusionCallbacks_, + class CopyOpG2S_, + class SmemLayoutAtomC_, + class CopyOpS2R_, + class CopyOpS2G_, + class SmemLayoutAtomD_, + class CopyOpR2S_, + class CopyAtomC_, + class CopyOpR2R_ +> +class Sm90EpilogueTmaWarpSpecializedBiasElementwise + : public CollectiveEpilogue< + Sm90TmaWarpSpecialized, + BlockTileShape_, + EpilogueTileShape_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpG2S_, + SmemLayoutAtomC_, + CopyOpS2R_, + CopyOpS2G_, + SmemLayoutAtomD_, + CopyOpR2S_, + CopyAtomC_, + CopyOpR2R_ +> { +private: + using Impl = + CollectiveEpilogue< + Sm90TmaWarpSpecialized, + BlockTileShape_, + EpilogueTileShape_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpG2S_, + SmemLayoutAtomC_, + CopyOpS2R_, + CopyOpS2G_, + SmemLayoutAtomD_, + CopyOpR2S_, + CopyAtomC_, + CopyOpR2R_ + >; +public: + using DispatchPolicy = Sm90TmaWarpSpecializedBiasElementwise; + using ElementCompute = typename Impl::ThreadEpilogueOp::ElementCompute; + using ElementBias = typename Impl::ThreadEpilogueOp::ElementBias; + using ElementT = typename Impl::ThreadEpilogueOp::ElementAux; + + // Constructor inheritance + using Impl::Impl; + + // Host side epilogue arguments + struct [[deprecated("use Sm90TmaWarpSpecialized Arguments instead")]] + Arguments { + struct ThreadArgs { + ElementCompute alpha{1}; + ElementCompute beta{0}; + ElementCompute const *alpha_ptr{nullptr}; + ElementCompute const *beta_ptr{nullptr}; + } thread; + ElementC_ const* ptr_C{nullptr}; + StrideC_ dC{}; + ElementD_* ptr_D{nullptr}; + StrideD_ dD{}; + ElementBias const* ptr_Bias{nullptr}; + ElementT* ptr_T{nullptr}; + + CUTLASS_HOST_DEVICE + operator typename Impl::Arguments() const { + typename Impl::Arguments arguments; + arguments.thread.alpha = thread.alpha; + arguments.thread.beta = thread.beta; + arguments.thread.alpha_ptr = thread.alpha_ptr; + arguments.thread.beta_ptr = thread.beta_ptr; + if constexpr (not cute::is_void_v) { + arguments.thread.bias_ptr = ptr_Bias; + } + if constexpr (not cute::is_void_v) { + arguments.thread.aux_ptr = ptr_T; + arguments.thread.dAux = dD; + } + arguments.ptr_C = ptr_C; + arguments.dC = dC; + arguments.ptr_D = ptr_D; + arguments.dD = dD; + + return arguments; + } + }; + +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/dispatch_policy.hpp b/include/cutlass/epilogue/dispatch_policy.hpp new file mode 100644 index 0000000000..a5f47f0832 --- /dev/null +++ b/include/cutlass/epilogue/dispatch_policy.hpp @@ -0,0 +1,196 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/numeric_conversion.h" +#include "cutlass/epilogue/thread/scale_type.h" + +////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue { + +////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////// +// +// Builder Epilogue Schedules +// +////////////////////////////////////////////////////////////////////////////// + +struct PtrArrayDefault {}; +struct EpilogueSimtVectorized {}; +struct EpiloguePtrArraySimtVectorized {}; +struct NoSmemWarpSpecialized {}; +struct PtrArrayNoSmemWarpSpecialized {}; +struct PtrArrayNoSmemWarpSpecializedTransposed {}; +struct PtrArrayPlanarComplexNoSmemWarpSpecialized {}; +struct TmaWarpSpecialized {}; +struct TmaWarpSpecializedCooperative {}; +struct PtrArrayTmaWarpSpecializedCooperative { + static constexpr int NumEpilogueWarpGroups = 2; +}; + +// Standard warp specialized epilogue +struct PtrArrayTmaWarpSpecialized { + static constexpr int NumEpilogueWarpGroups = 1; +}; + +// Pingpong kernel epilogue +struct PtrArrayTmaWarpSpecializedPingpong { + static constexpr int NumEpilogueWarpGroups = 2; +}; + +// DEPRECATED schedules, will be removed in next release +struct TmaWarpSpecializedElementwiseBase : public TmaWarpSpecialized {}; +struct TmaWarpSpecializedCooperativeElementwiseBase : public TmaWarpSpecializedCooperative {}; +template < + template class ActivationFunctor_, + thread::ScaleType::Kind Scale_ = thread::ScaleType::Default, + FloatRoundStyle Round_ = FloatRoundStyle::round_to_nearest +> +struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombEltAct instead")]] +TmaWarpSpecializedElementwise : public TmaWarpSpecializedElementwiseBase { + template + using ActivationFunctor = ActivationFunctor_; + static constexpr 
thread::ScaleType::Kind Scale = Scale_; + static constexpr FloatRoundStyle Round = Round_; +}; + +template < + template class ActivationFunctor_, + thread::ScaleType::Kind Scale_ = thread::ScaleType::Default, + FloatRoundStyle Round_ = FloatRoundStyle::round_to_nearest +> +struct [[deprecated("Use TmaWarpSpecializedCooperative with fusion::LinCombEltAct instead")]] +TmaWarpSpecializedCooperativeElementwise : public TmaWarpSpecializedCooperativeElementwiseBase { + template + using ActivationFunctor = ActivationFunctor_; + static constexpr thread::ScaleType::Kind Scale = Scale_; + static constexpr FloatRoundStyle Round = Round_; +}; + +struct TmaWarpSpecializedBiasElementwiseBase : public TmaWarpSpecialized{}; +struct TmaWarpSpecializedCooperativeBiasElementwiseBase : public TmaWarpSpecializedCooperative {}; + +template < + template class ActivationFunctor_, + class ElementT_, + template class BiasOp_, + bool StoreT_, + class ElementBias_ +> +struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombPerRowBiasEltActAux instead")]] +TmaWarpSpecializedBiasElementwise : public TmaWarpSpecializedBiasElementwiseBase { + template + using ActivationFunctor = ActivationFunctor_; + using ElementT = ElementT_; + + template + using BiasOp = BiasOp_; + + static constexpr bool StoreT = StoreT_; + using ElementBias = ElementBias_; +}; + +template < + template class ActivationFunctor_, + class ElementT_, + template class BiasOp_, + bool StoreT_, + class ElementBias_ +> +struct [[deprecated("Use TmaWarpSpecializedCooperative with fusion::LinCombPerRowBiasEltActAux instead")]] +TmaWarpSpecializedCooperativeBiasElementwise : public TmaWarpSpecializedCooperativeBiasElementwiseBase { + template + using ActivationFunctor = ActivationFunctor_; + + using ElementT = ElementT_; + + template + using BiasOp = BiasOp_; + + static constexpr bool StoreT = StoreT_; + using ElementBias = ElementBias_; +}; + +////////////////////////////////////////////////////////////////////////////// +// +// Collective Dispatch Policies +// +////////////////////////////////////////////////////////////////////////////// + +template< + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_ +> +struct Sm90TmaWarpSpecialized { + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static int FragmentSize = FragmentSize_; + constexpr static bool ReuseSmemC = ReuseSmemC_; + constexpr static bool DelayTmaStore = DelayTmaStore_; +}; + +template< + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_, + int NumEpilogueWarpGroups_ +> +struct Sm90PtrArrayTmaWarpSpecialized { + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static int FragmentSize = FragmentSize_; + constexpr static bool ReuseSmemC = ReuseSmemC_; + constexpr static bool DelayTmaStore = DelayTmaStore_; + constexpr static int NumEpilogueWarpGroups = NumEpilogueWarpGroups_; +}; + +// DEPRECATED policies, will be removed in next release +template< + int StagesC_, + int StagesD_, + int FragmentSize_ = 2 +> +struct Sm90TmaWarpSpecializedBiasElementwise { + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static int FragmentSize = FragmentSize_; +}; + +////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue diff --git a/include/cutlass/epilogue/fusion/callbacks.hpp b/include/cutlass/epilogue/fusion/callbacks.hpp 
new file mode 100644 index 0000000000..9ee37234cb --- /dev/null +++ b/include/cutlass/epilogue/fusion/callbacks.hpp @@ -0,0 +1,89 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/detail/dependent_false.hpp" +#include "cutlass/epilogue/fusion/operations.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Dispatch interface for epilogue fusion callbacks +// For visitor fusions, this is just a convenience wrapper to provide metadata and non-nested args. +// It is also valid to just pass visitor callbacks directly to the collective, e.g. fusion::Sm90LinearCombination, +// provided the collective supports a visitor callbacks interface. This is useful for implementing custom fusions. +template < + class DispatchPolicy, // specialize on collective's dispatch policy since callbacks API will depend on collective's algorithm + class Operation, // the fusion operation being performed, e.g. fusion::LinearCombination + class CtaTile_MNK, // computed tile per CTA + class EpilogueTile_MN, // epilogue subtile size + class... Args // callbacks implementation dependent args (e.g. 
copy atoms, smem layouts) +> +struct FusionCallbacks { + static_assert(cutlass::detail::dependent_false, "Could not find a callbacks specialization."); +}; + +// Metadata helper to handle custom EVTs or other non-FusionCallbacks types +template +struct FusionCallbacksTraits { + using DispatchPolicy = void; + using Operation = T; + using CtaTile_MNK = void; + using EpilogueTile_MN = void; + using ElementCompute = void; +}; + +template < + class DispatchPolicy_, + class Operation_, + class CtaTile_MNK_, + class EpilogueTile_MN_, + class... Args +> +struct FusionCallbacksTraits< + FusionCallbacks +> { + using DispatchPolicy = DispatchPolicy_; + using Operation = Operation_; + using CtaTile_MNK = CtaTile_MNK_; + using EpilogueTile_MN = EpilogueTile_MN_; + using ElementCompute = typename Operation::ElementCompute; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/fusion/operations.hpp b/include/cutlass/epilogue/fusion/operations.hpp new file mode 100644 index 0000000000..1ef06a538b --- /dev/null +++ b/include/cutlass/epilogue/fusion/operations.hpp @@ -0,0 +1,495 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include +#include +#include +#include // cute::false_type + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Fusion Operations +// Template args must not be implementation dependent +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct FusionOperation { + // metadata types/queries that can be overrided + using ElementOutput = void; + using ElementCompute = void; + + using ElementSource = void; + static constexpr bool IsSourceSupported = false; + + using ElementScalar = void; + static constexpr int AlignmentScalar = 0; + static constexpr bool IsScaleFactorSupported = false; + static constexpr bool IsPerRowScaleSupported = false; + static constexpr bool IsPerColScaleSupported = false; + + using ElementBias = void; + static constexpr int AlignmentBias = 0; + static constexpr bool IsPerRowBiasSupported = false; + static constexpr bool IsPerColBiasSupported = false; + static constexpr bool IsDePerRowBiasSupported = false; + + using ActivationFn = void; + static constexpr bool IsEltActSupported = false; + static constexpr bool IsDeEltActSupported = false; + + using ElementAux = void; + using GmemLayoutTagAux = void; + static constexpr int AlignmentAux = 0; + static constexpr bool IsAuxOutSupported = false; + static constexpr bool IsAuxInSupported = false; + + using ElementAmax = void; + static constexpr bool IsAbsMaxSupported = false; + +}; + +// D = alpha * acc +template< + class ElementOutput_, + class ElementCompute_, + class ElementScalar_ = ElementCompute_, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct ScaledAcc : FusionOperation { + using ElementOutput = ElementOutput_; + using ElementCompute = ElementCompute_; + using ElementScalar = ElementScalar_; + static constexpr int AlignmentScalar = 1; + static constexpr auto RoundStyle = RoundStyle_; +}; + +// D = alpha * acc + beta * C +template< + class ElementOutput_, + class ElementCompute_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinearCombination + : ScaledAcc { + using ElementSource = ElementSource_; + static constexpr bool IsSourceSupported = true; +}; + +// D = activation(alpha * acc + beta * C) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombEltAct + : LinearCombination { + using ActivationFn = ActivationFn_; + static constexpr bool IsEltActSupported = true; +}; + +// D = softmax(top_k(alpha * acc + beta * C)) +template< + int TopK, + class ElementOutput_, + class ElementCompute_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombTopKSoftmaxCol + : LinearCombination { +}; + + +// D = alpha * acc + beta * C + per-row bias +template< + class ElementOutput_, + class ElementCompute_, + class 
ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerRowBias + : LinearCombination { + using ElementBias = ElementBias_; + static constexpr int AlignmentBias = AlignmentBias_; + static constexpr bool IsPerRowBiasSupported = true; +}; + +// D = alpha * acc + beta * C + per-column bias +template< + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerColBias + : LinearCombination { + using ElementBias = ElementBias_; + static constexpr int AlignmentBias = AlignmentBias_; + static constexpr bool IsPerColBiasSupported = true; +}; + +// D = activation(alpha * acc + beta * C + per-row bias) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerRowBiasEltAct + : LinCombPerRowBias { + using ActivationFn = ActivationFn_; + static constexpr bool IsEltActSupported = true; +}; + +// D = activation(alpha * acc + beta * C + per-column bias) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerColBiasEltAct + : LinCombPerColBias { + using ActivationFn = ActivationFn_; + static constexpr bool IsEltActSupported = true; +}; + +// D = activation(alpha * acc + beta * C + per-row bias) +// aux = alpha * acc + beta * C + per-row bias +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerRowBiasEltActAux + : LinCombPerRowBiasEltAct { + using ElementAux = ElementAux_; + using GmemLayoutTagAux = GmemLayoutTagAux_; + static constexpr int AlignmentAux = AlignmentAux_; + static constexpr bool IsAuxOutSupported = true; +}; + +// D = activation(alpha * acc + beta * C + per-col bias) +// aux = alpha * acc + beta * C + per-col bias +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerColBiasEltActAux + : LinCombPerColBiasEltAct { + using ElementAux = ElementAux_; + using GmemLayoutTagAux 
= GmemLayoutTagAux_; + static constexpr int AlignmentAux = AlignmentAux_; + static constexpr bool IsAuxOutSupported = true; +}; + +// D = activation(per-row alpha * acc + per-row beta * C + per-row bias) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, // per-row alpha/beta + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + int AlignmentScalar_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct PerRowLinCombPerRowBiasEltAct + : LinCombPerRowBiasEltAct { + static constexpr int AlignmentScalar = AlignmentScalar_; + static constexpr bool IsPerRowScaleSupported = true; +}; + +// D = per-column alpha * per-row alpha * acc + beta * C +template< + class ElementOutput_, + class ElementCompute_, + class ElementSource_ = ElementCompute_, + class ElementScalar_ = ElementCompute_, + int AlignmentScalar_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct OuterProdLinComb : FusionOperation { + using ElementOutput = ElementOutput_; + using ElementCompute = ElementCompute_; + using ElementSource = ElementSource_; + using ElementScalar = ElementScalar_; + static constexpr int AlignmentScalar = AlignmentScalar_; + static constexpr auto RoundStyle = RoundStyle_; + static constexpr bool IsSourceSupported = true; + static constexpr bool IsPerRowScaleSupported = true; + static constexpr bool IsPerColScaleSupported = true; +}; + +// D = activation(per-col alpha * acc + per-col beta * C + per-column bias) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, // per-row alpha/beta + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + int AlignmentScalar_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct PerColLinCombPerColBiasEltAct + : LinCombPerColBiasEltAct { + static constexpr int AlignmentScalar = AlignmentScalar_; + static constexpr bool IsPerColScaleSupported = true; +}; + +// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias +// if D is fp8 +// D = scale_d * activation(Z) +// else +// D = activation(Z) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct ScaledLinCombPerRowBiasEltAct + : LinCombPerRowBiasEltAct { + static constexpr bool IsScaleFactorSupported = true; +}; + +// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-col bias +// if D is fp8 +// D = scale_d * activation(Z) +// else +// D = activation(Z) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct ScaledLinCombPerColBiasEltAct + : LinCombPerColBiasEltAct { + static constexpr bool IsScaleFactorSupported = true; +}; + +// Z = scale_a * scale_b * alpha * acc + 
scale_c * beta * C + per-row bias +// if D is fp8 +// amax_d = max(abs(elements in activation(Z))) +// D = scale_d * activation(Z) +// else +// D = activation(Z) +// if Aux is fp8 +// amax_aux = max(abs(elements in Z)) +// Aux = scale_aux * Z +// else +// Aux = Z +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementAmax_ = ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct ScaledLinCombPerRowBiasEltActAmaxAux + : ScaledLinCombPerRowBiasEltAct { + using ElementAmax = ElementAmax_; + static constexpr bool IsAbsMaxSupported = true; + + using ElementAux = ElementAux_; + using GmemLayoutTagAux = GmemLayoutTagAux_; + static constexpr int AlignmentAux = AlignmentAux_; + static constexpr bool IsAuxOutSupported = true; +}; + +// Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-col bias +// if D is fp8 +// amax_d = max(abs(elements in activation(Z))) +// D = scale_d * activation(Z) +// else +// D = activation(Z) +// if Aux is fp8 +// amax_aux = max(abs(elements in Z)) +// Aux = scale_aux * Z +// else +// Aux = Z +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementAmax_ = ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct ScaledLinCombPerColBiasEltActAmaxAux + : ScaledLinCombPerColBiasEltAct { + using ElementAmax = ElementAmax_; + static constexpr bool IsAbsMaxSupported = true; + + using ElementAux = ElementAux_; + using GmemLayoutTagAux = GmemLayoutTagAux_; + static constexpr int AlignmentAux = AlignmentAux_; + static constexpr bool IsAuxOutSupported = true; +}; + +// Z = Aux +// dY = alpha * acc + beta * C +// D = d_activation(dY, Z) +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombDeEltAct + : LinearCombination { + using ActivationFn = ActivationFn_; + static constexpr bool IsDeEltActSupported = true; + + using ElementAux = ElementAux_; + using GmemLayoutTagAux = GmemLayoutTagAux_; + static constexpr int AlignmentAux = AlignmentAux_; + static constexpr bool IsAuxInSupported = true; +}; + +// Z = Aux +// dY = alpha * acc + beta * C +// D = d_activation(dY, Z) +// dBias = sum of columns of D +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementBias_ = ElementCompute_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = 
FloatRoundStyle::round_to_nearest +> +struct LinCombDeEltActDePerRowBias + : LinCombDeEltAct { + using ElementBias = ElementBias_; + static constexpr int AlignmentBias = AlignmentBias_; + static constexpr bool IsDePerRowBiasSupported = true; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp new file mode 100644 index 0000000000..3e57fa0ba6 --- /dev/null +++ b/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp @@ -0,0 +1,2688 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Fusion callbacks specializations for the sm90 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" + +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp" + +#include "cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Sm90EVT = Sm90TreeVisitor; + +// D = alpha * acc +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::ScaledAcc, + CtaTileShapeMNK, + EpilogueTile +> : Sm90EVT, + Sm90ScalarBroadcast>, + Sm90AccFetch + > { + using Impl = + Sm90EVT, + Sm90ScalarBroadcast>, + Sm90AccFetch + >; + using Operation = fusion::ScaledAcc; + + struct Arguments { + // Give a name and flat ordering to the fusion callback args + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + + // Conversion to the args expected by the visitor implementation + // to_underlying_arguments will implicitly call this + operator typename Impl::Arguments() const { + return + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }; // end binary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C +template< + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinearCombination = + Sm90EVT, // beta * C + (alpha * acc) + Sm90ScalarBroadcast>, // beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + Sm90ScalarBroadcast>, // alpha + Sm90AccFetch // acc + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinearCombination, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinearCombination::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> { + + using Impl = Sm90LinearCombination::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinearCombination; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = 
ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + operator typename Impl::Arguments() const { + return + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C, where beta and alpha can be vectors for each batch +template< + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinearCombinationPtrArray = + Sm90EVT, // beta * C + (alpha * acc) + Sm90ScalarBroadcastPtrArray>, // beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + Sm90ScalarBroadcastPtrArray>, // alpha + Sm90AccFetch // acc + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90PtrArrayTmaWarpSpecialized, + fusion::LinearCombination, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinearCombinationPtrArray::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> { + + using Impl = Sm90LinearCombinationPtrArray::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinearCombination; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementScalar const* const* alpha_ptr_array = nullptr; + ElementScalar const* const* beta_ptr_array = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + operator typename Impl::Arguments() const { + return + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C) +template< + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombEltAct = + Sm90EVT, // activation(beta * C + (alpha * acc)) + Sm90LinearCombination // beta * C + (alpha * acc) + >; 
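+// Illustrative sketch (not part of the CUTLASS API): every FusionCallbacks specialization in this
+// file exposes a flat, user-facing Arguments struct whose implicit conversion operator repacks the
+// named fields into the nested argument tree consumed by the epilogue visitor tree (EVT), which is
+// what to_underlying_arguments() relies on. The self-contained example below uses hypothetical
+// types (NestedTreeArgs, FlatArguments) purely to spell out that flat-to-nested conversion pattern;
+// it is not the library's actual argument plumbing.
+//
+//   struct NestedTreeArgs {                                  // what the visitor tree consumes
+//     struct { float scalar; float const* ptr; } beta;       // leaf args : beta
+//     struct {} source;                                      // leaf args : C
+//     struct {
+//       struct { float scalar; float const* ptr; } alpha;    // leaf args : alpha
+//       struct {} acc;                                       // leaf args : acc
+//     } alpha_times_acc;                                     // binary op : alpha * acc
+//   };
+//
+//   struct FlatArguments {                                   // what the user fills in
+//     float alpha = 1.0f;
+//     float beta  = 0.0f;
+//     float const* alpha_ptr = nullptr;
+//     float const* beta_ptr  = nullptr;
+//
+//     operator NestedTreeArgs() const {                      // implicit repacking, analogous to
+//       return { {beta, beta_ptr}, {},                       // the conversion operators above
+//                { {alpha, alpha_ptr}, {} } };
+//     }
+//   };
+//
+//   // Usage: FlatArguments args{/*alpha=*/2.0f, /*beta=*/1.0f};  // D = 2 * acc + 1 * C
+//   //        NestedTreeArgs tree = args;                         // conversion happens implicitly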
+ +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombEltAct, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombEltAct { + + using Impl = Sm90LinCombEltAct::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinCombEltAct; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // unary op: activation(beta * C + (alpha * acc)) + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args: activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C), where beta and alpha can be vectors for each batch +template< + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombEltActPtrArray = + Sm90EVT, // activation(beta * C + (alpha * acc)) + Sm90LinearCombinationPtrArray // beta * C + (alpha * acc) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90PtrArrayTmaWarpSpecialized, + fusion::LinCombEltAct, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombEltActPtrArray { + + using Impl = Sm90LinCombEltActPtrArray::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinCombEltAct; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementScalar const* const* alpha_ptr_array = nullptr; + ElementScalar const* const* beta_ptr_array = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // 
unary op: activation(beta * C + (alpha * acc)) + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args: activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C + per-row bias +template< + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerRowBias = + Sm90EVT, // beta * C + (alpha * acc + bias) + Sm90ScalarBroadcast>, // beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90ScalarBroadcast>, // alpha + Sm90AccFetch, // acc + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerRowBias, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombPerRowBias< + CtaTileShapeMNK, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle> { + using Impl = Sm90LinCombPerRowBias< + CtaTileShapeMNK, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>; + using Operation = fusion::LinCombPerRowBias< + ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + operator typename Impl::Arguments() const { + return + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C + per-column bias +template< + int StagesC, + class CtaTileShapeMNK, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class 
ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerColBias = + Sm90EVT, // beta * C + (alpha * acc + bias) + Sm90ScalarBroadcast>, // beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90ScalarBroadcast>, // alpha + Sm90AccFetch, // acc + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_0,_1,int64_t>, AlignmentBias> // bias + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerColBias, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombPerColBias< + StagesC, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle> { + using Impl = Sm90LinCombPerColBias< + StagesC, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>; + using Operation = fusion::LinCombPerColBias< + ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + operator typename Impl::Arguments() const { + return + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C + per-row bias) +template< + class CtaTileShapeMNK, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerRowBiasEltAct = + Sm90EVT, + Sm90LinCombPerRowBias + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerRowBiasEltAct< + 
ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90LinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + using Operation = + fusion::LinCombPerRowBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C + per-column bias) +template< + int StagesC, + class CtaTileShapeMNK, + class EpilogueTile, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerColBiasEltAct = + Sm90EVT, + Sm90LinCombPerColBias + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombPerColBiasEltAct< + StagesC, CtaTileShapeMNK, EpilogueTile, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90LinCombPerColBiasEltAct< + StagesC, CtaTileShapeMNK, EpilogueTile, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, 
RoundStyle + >; + using Operation = + fusion::LinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C + per-row bias) +// Aux = alpha * acc + beta * C + per-row bias) +template< + class CtaTileShapeMNK, + class EpilogueTile, + int Stages, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerRowBiasEltActAux = + Sm90EVT, + Sm90EVT, + Sm90LinCombPerRowBias + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentAux, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpR2S +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerRowBiasEltActAux< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpR2S +> : Sm90LinCombPerRowBiasEltActAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90LinCombPerRowBiasEltActAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, 
ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + using Operation = + fusion::LinCombPerRowBiasEltActAux< + GmemLayoutTagAux, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux* aux_ptr = nullptr; + StrideAux dAux = {}; + + operator typename Impl::Arguments() const { + return + { // unary op : activation(store(beta * C + (alpha * acc + bias))) + { // unary op : store(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, dAux} // unary args : store + }, // end unary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// D = activation(alpha * acc + beta * C + per_col bias) +// Aux = alpha * acc + beta * C + per_col bias) +template< + int StagesC, + class CtaTileShapeMNK, + class EpilogueTile, + int Stages, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerColBiasEltActAux = + Sm90EVT, + Sm90EVT, + Sm90LinCombPerColBias + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentAux, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpR2S +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerColBiasEltActAux< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpR2S +> : Sm90LinCombPerColBiasEltActAux< + StagesC, CtaTileShapeMNK, EpilogueTile, StagesD, 
cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90LinCombPerColBiasEltActAux< + StagesC, CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + using Operation = + fusion::LinCombPerColBiasEltActAux< + GmemLayoutTagAux, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux* aux_ptr = nullptr; + StrideAux dAux = {}; + + operator typename Impl::Arguments() const { + return + { // unary op : activation(store(beta * C + (alpha * acc + bias))) + { // unary op : store(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, dAux} // unary args : store + }, // end unary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = per-row alpha * acc + per-row beta * C + per-row bias +template< + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + int AlignmentScalar = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90PerRowLinCombPerRowBias = + Sm90EVT, // beta * C + (alpha * acc + bias) + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride, AlignmentScalar>, // beta, dynamic scalar/vector broadcast + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride, AlignmentScalar>, // alpha, dynamic scalar/vector broadcast + Sm90AccFetch, // acc + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias + > + >; + +// D = activation(per-row alpha * acc + per-row beta * C + per-row bias) +template< + class CtaTileShapeMNK, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class 
ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + int AlignmentScalar = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90PerRowLinCombPerRowBiasEltAct = + Sm90EVT, + Sm90PerRowLinCombPerRowBias + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + int AlignmentScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::PerRowLinCombPerRowBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90PerRowLinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + > { + + using Impl = + Sm90PerRowLinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >; + using Operation = + fusion::PerRowLinCombPerRowBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >; + + struct Arguments { + using StrideAlpha = Stride; + using StrideBeta = Stride; + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + StrideAlpha dAlpha = {bool(1), _0{}, 0}; + StrideBeta dBeta = {bool(1), _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {beta_ptr, beta, dBeta}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {alpha_ptr, alpha, dAlpha}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = per-col alpha * acc + per-col beta * C + per-column bias +template< + int StagesC, + class CtaTileShapeMNK, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + int AlignmentScalar = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90PerColLinCombPerColBias = + Sm90EVT, // beta * C + (alpha * acc + bias) + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride<_0,bool,int64_t>, AlignmentScalar>, // beta, 
dynamic scalar/vector broadcast + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride<_0,bool,int64_t>, AlignmentScalar>, // alpha, dynamic scalar/vector broadcast + Sm90AccFetch, // acc + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_0,_1,int64_t>, AlignmentBias> // bias + > + >; + +// D = activation(per-col alpha * acc + per-col beta * C + per-column bias) +template< + int StagesC, + class CtaTileShapeMNK, + class EpilogueTile, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + int AlignmentScalar = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90PerColLinCombPerColBiasEltAct = + Sm90EVT, + Sm90PerColLinCombPerColBias + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + int AlignmentScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::PerColLinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90PerColLinCombPerColBiasEltAct< + StagesC, CtaTileShapeMNK, EpilogueTile, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + > { + + using Impl = + Sm90PerColLinCombPerColBiasEltAct< + StagesC, CtaTileShapeMNK, EpilogueTile, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >; + using Operation = + fusion::PerColLinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,bool,int64_t>; + using StrideBeta = Stride<_0,bool,int64_t>; + StrideAlpha dAlpha = {_0{}, bool(1), 0}; + StrideBeta dBeta = {_0{}, bool(1), 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {beta_ptr, beta, dBeta}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {alpha_ptr, alpha, dAlpha}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + 
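+// Illustrative sketch (not the CUTLASS implementation): the per-row and per-column fusions above
+// differ only in the mode along which the bias and scale vectors are broadcast. A per-row bias is
+// a column vector with stride (_1,_0,int64_t) -- one value per row, reused across columns, loaded
+// via Sm90ColBroadcast -- while a per-column bias is a row vector with stride (_0,_1,int64_t),
+// loaded via Sm90RowBroadcast. A stride whose leading mode is a runtime bool (as used for
+// alpha/beta above) lets the same node broadcast either a single scalar or a per-row/per-column
+// vector, selected at runtime. The plain C++ reference below, with hypothetical names, spells out
+// the broadcast arithmetic for a small M x N tile.
+//
+//   #include <array>
+//
+//   constexpr int M = 2, N = 3;
+//
+//   // D(i,j) = alpha * acc(i,j) + beta * C(i,j) + bias_row(i)   (per-row bias,    stride (1,0))
+//   // D(i,j) = alpha * acc(i,j) + beta * C(i,j) + bias_col(j)   (per-column bias, stride (0,1))
+//   inline void lin_comb_bias(float alpha, float beta,
+//                             std::array<float, M * N> const& acc,
+//                             std::array<float, M * N> const& C,
+//                             std::array<float, M> const& bias_row,
+//                             std::array<float, N> const& bias_col,
+//                             bool per_row,
+//                             std::array<float, M * N>& D) {
+//     for (int i = 0; i < M; ++i) {
+//       for (int j = 0; j < N; ++j) {
+//         float bias = per_row ? bias_row[i] : bias_col[j];
+//         D[i * N + j] = alpha * acc[i * N + j] + beta * C[i * N + j] + bias;
+//       }
+//     }
+//   }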
+///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +constexpr bool is_fp8_v = cute::is_same_v || cute::is_same_v; + +// We only apply the scaling factor if output is fp8 +template +struct ScaleOutOp { template using Op = cutlass::first; }; +template <> +struct ScaleOutOp { template using Op = cutlass::multiplies; }; +template <> +struct ScaleOutOp { template using Op = cutlass::multiplies; }; + +template +using amax = cutlass::maximum_absolute_value_reduction; // propogate nans + +}; // end namespace detail + +// D = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias +template< + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerRowBias = + Sm90EVT, // beta * C + (alpha * acc + bias) + Sm90ScalarBroadcast, 2>, // scale_c * beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90ScalarBroadcast, 3>, // scale_a * scale_b * alpha + Sm90AccFetch, // acc + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias + > + >; + +// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias +// if D is fp8 +// D = scale_d * activation(Z) +// else +// D = activation(Z) +template< + class CtaTileShapeMNK, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerRowBiasEltAct = + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + Sm90EVT, // activation(Z) + // Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias + Sm90ScaledLinCombPerRowBias + >, + Sm90ScalarBroadcast // scale_d + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::ScaledLinCombPerRowBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90ScaledLinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90ScaledLinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + using Operation = + fusion::ScaledLinCombPerRowBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + ElementScalar 
scale_a = ElementScalar(1); + ElementScalar scale_b = ElementScalar(1); + ElementScalar scale_c = ElementScalar(1); + ElementScalar scale_d = ElementScalar(1); + ElementScalar const* scale_a_ptr = nullptr; + ElementScalar const* scale_b_ptr = nullptr; + ElementScalar const* scale_c_ptr = nullptr; + ElementScalar const* scale_d_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // binary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)) * scale_d + { // unary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)) + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha, {_0{}, _0{}, 0}, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {{scale_d}, + {scale_d_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-col bias +template< + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerColBias = + Sm90EVT, // beta * C + (alpha * acc + bias) + Sm90ScalarBroadcast, 2>, // scale_c * beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90ScalarBroadcast, 3>, // scale_a * scale_b * alpha + Sm90AccFetch, // acc + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_0,_1,int64_t>, AlignmentBias> // bias + > + >; + +// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-col bias +// if D is fp8 +// D = scale_d * activation(Z) +// else +// D = activation(Z) +template< + class CtaTileShapeMNK, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerColBiasEltAct = + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + Sm90EVT, // activation(Z) + // Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias + Sm90ScaledLinCombPerColBias + >, + 
Sm90ScalarBroadcast // scale_d + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::ScaledLinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90ScaledLinCombPerColBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90ScaledLinCombPerColBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + using Operation = + fusion::ScaledLinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + ElementScalar scale_a = ElementScalar(1); + ElementScalar scale_b = ElementScalar(1); + ElementScalar scale_c = ElementScalar(1); + ElementScalar scale_d = ElementScalar(1); + ElementScalar const* scale_a_ptr = nullptr; + ElementScalar const* scale_b_ptr = nullptr; + ElementScalar const* scale_c_ptr = nullptr; + ElementScalar const* scale_d_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // binary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)) * scale_d + { // unary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)) + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha, {_0{}, _0{}, 0}, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {{scale_d}, + {scale_d_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias +// if D is fp8 +// amax_d = max(abs(elements in activation(Z))) +// D = 
scale_d * activation(Z) +// else +// D = activation(Z) +// if Aux is fp8 +// amax_aux = max(abs(elements in Z)) +// Aux = scale_aux * Z +// else +// Aux = Z + +// fp8 aux specialization +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerRowBiasEltActAmaxAuxFp8 = + Sm90SplitTreeVisitor< + // Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias + Sm90ScaledLinCombPerRowBias, + // D = activation(Z) * scale_d, amax_d = max(abs(elements in D)) + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + Sm90EVT, // amax_d + Sm90EVT, // activation(Z) + Sm90SplitTreeFetch // Z + > + >, + Sm90ScalarBroadcast // scale_d + >, + // Aux = Z * scale_aux, amax_aux = max(abs(elements in Aux)) + Sm90EVT, // store(Aux) + Sm90EVT, // Z * scale_aux + Sm90EVT, // amax_aux + Sm90SplitTreeFetch // Z + >, + Sm90ScalarBroadcast // scale_aux + > + > + >; + +// non-fp8 aux specialization +// lets us use some EVT specializations such as relu + uint1b_t aux +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerRowBiasEltActAmaxAuxNotFp8 = + // D = activation(Z) * scale_d, amax_d = max(abs(elements in D)) + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + Sm90EVT, // amax_d + Sm90EVT, // activation(Z) + Sm90EVT, // Aux = Z + // Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias + Sm90ScaledLinCombPerRowBias + > + > + >, + Sm90ScalarBroadcast // scale_d + >; + +// dispatcher +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerRowBiasEltActAmaxAux = conditional_t, + Sm90ScaledLinCombPerRowBiasEltActAmaxAuxFp8< + CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar,AlignmentAux, AlignmentBias, RoundStyle + >, + Sm90ScaledLinCombPerRowBiasEltActAmaxAuxNotFp8< + CtaTileShapeMNK, EpilogueTile, StagesD, 
StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > +>; + + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementAmax, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentAux, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpR2S +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::ScaledLinCombPerRowBiasEltActAmaxAux< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpR2S +> : Sm90ScaledLinCombPerRowBiasEltActAmaxAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, + SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90ScaledLinCombPerRowBiasEltActAmaxAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, + SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + using Operation = + fusion::ScaledLinCombPerRowBiasEltActAmaxAux< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + ElementScalar scale_a = ElementScalar(1); + ElementScalar scale_b = ElementScalar(1); + ElementScalar scale_c = ElementScalar(1); + ElementScalar scale_d = ElementScalar(1); + ElementScalar const* scale_a_ptr = nullptr; + ElementScalar const* scale_b_ptr = nullptr; + ElementScalar const* scale_c_ptr = nullptr; + ElementScalar const* scale_d_ptr = nullptr; + + ElementScalar scale_aux = ElementScalar(1); + ElementScalar const* scale_aux_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + ElementAmax* amax_D_ptr = nullptr; + ElementAmax* amax_aux_ptr = nullptr; + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux* aux_ptr = nullptr; + StrideAux dAux = {}; + + operator typename Impl::Arguments() const { + // Only compute amax_d if D is fp8 + ElementAmax* amax_D_ptr_ = nullptr; + if constexpr (detail::is_fp8_v) { + amax_D_ptr_ = amax_D_ptr; + } + + // Aux is fp8 -> DAG arguments + if constexpr (detail::is_fp8_v) { + typename Impl::Arguments args; + // always use structured binding to unpack DAG args since it may or may not be a tuple + auto& 
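// A minimal host-side sketch of how the Arguments struct above might be populated for an
// fp8 D with an fp8 Aux tensor. The scale and pointer names are placeholders for
// illustration only and are not part of this interface:
//
//   Arguments args;
//   args.alpha        = 1.0f;                // combined as (scale_a * scale_b * alpha) * acc
//   args.beta         = 1.0f;                // combined as (scale_c * beta) * C
//   args.scale_a      = a_dequant_scale;     // per-tensor dequantization scales
//   args.scale_b      = b_dequant_scale;
//   args.scale_c      = c_dequant_scale;
//   args.scale_d      = d_quant_scale;       // applied (and amax_D written) only when D is fp8
//   args.scale_aux    = aux_quant_scale;     // applied (and amax_aux written) only when Aux is fp8
//   args.bias_ptr     = bias_device_ptr;     // per-row bias vector
//   args.aux_ptr      = aux_device_ptr;      // auxiliary output, same extent as D
//   args.amax_D_ptr   = amax_d_device_ptr;   // receives max(abs(activation(Z)))
//   args.amax_aux_ptr = amax_aux_device_ptr; // receives max(abs(Z))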
[Z_args, aux_args, D_args] = args; + + Z_args = + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha ,{_0{}, _0{}, 0}, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }; // end ternary op + + D_args = + { // binary op : activation(Z) * scale_d or activation(Z) + { // unary op : reduce(activation(Z)) + { // unary op : activation(Z) + {}, // leaf args : Z + activation // unary args : activation + }, // end unary op + {amax_D_ptr_} // unary args : reduce + }, // end unary op + {{scale_d}, + {scale_d_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + + aux_args = + { // unary op : store(Aux) + { // binary op : Z * scale_d or Z + { // unary op : reduce(Z) + {}, // leaf args : Z + {amax_aux_ptr} // unary args : reduce + }, // end unary op + {{scale_aux}, + {scale_aux_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies + }, // end binary op + {aux_ptr, dAux} // unary args : store + }; // end unary op + + return args; + } + + // Aux is not fp8 -> Tree arguments + else { + return + { // binary op : activation(Z) * scale_d or activation(Z) + { // unary op : reduce(activation(Z)) + { // unary op : activation(Z) + { // unary op : store(Z) + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias + }, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, dAux} // unary args : store + }, // end unary op + activation // unary args : activation + }, // end unary op + {amax_D_ptr_} // unary args : reduce + }, // end unary op + {{scale_d},{scale_d_ptr}}, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + } + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-col bias +// if D is fp8 +// amax_d = max(abs(elements in activation(Z))) +// D = scale_d * activation(Z) +// else +// D = activation(Z) +// if Aux is fp8 +// amax_aux = max(abs(elements in Z)) +// Aux = scale_aux * Z +// else +// Aux = Z + +// fp8 aux specialization +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int 
AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerColBiasEltActAmaxAuxFp8 = + Sm90SplitTreeVisitor< + // Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-col bias + Sm90ScaledLinCombPerColBias, + // D = activation(Z) * scale_d, amax_d = max(abs(elements in D)) + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + Sm90EVT, // amax_d + Sm90EVT, // activation(Z) + Sm90SplitTreeFetch // Z + > + >, + Sm90ScalarBroadcast // scale_d + >, + // Aux = Z * scale_aux, amax_aux = max(abs(elements in Aux)) + Sm90EVT, // store(Aux) + Sm90EVT, // Z * scale_aux + Sm90EVT, // amax_aux + Sm90SplitTreeFetch // Z + >, + Sm90ScalarBroadcast // scale_aux + > + > + >; + +// non-fp8 aux specialization +// lets us use some EVT specializations such as relu + uint1b_t aux +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerColBiasEltActAmaxAuxNotFp8 = + // D = activation(Z) * scale_d, amax_d = max(abs(elements in D)) + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + Sm90EVT, // amax_d + Sm90EVT, // activation(Z) + Sm90EVT, // Aux = Z + // Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias + Sm90ScaledLinCombPerColBias + > + > + >, + Sm90ScalarBroadcast // scale_d + >; + +// dispatcher +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerColBiasEltActAmaxAux = conditional_t, + Sm90ScaledLinCombPerColBiasEltActAmaxAuxFp8< + CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar,AlignmentAux, AlignmentBias, RoundStyle + >, + Sm90ScaledLinCombPerColBiasEltActAmaxAuxNotFp8< + CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > +>; + + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementAmax, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentAux, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class 
CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpR2S +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::ScaledLinCombPerColBiasEltActAmaxAux< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpR2S +> : Sm90ScaledLinCombPerColBiasEltActAmaxAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, + SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90ScaledLinCombPerColBiasEltActAmaxAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, + SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + using Operation = + fusion::ScaledLinCombPerColBiasEltActAmaxAux< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + ElementScalar scale_a = ElementScalar(1); + ElementScalar scale_b = ElementScalar(1); + ElementScalar scale_c = ElementScalar(1); + ElementScalar scale_d = ElementScalar(1); + ElementScalar const* scale_a_ptr = nullptr; + ElementScalar const* scale_b_ptr = nullptr; + ElementScalar const* scale_c_ptr = nullptr; + ElementScalar const* scale_d_ptr = nullptr; + + ElementScalar scale_aux = ElementScalar(1); + ElementScalar const* scale_aux_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + ElementAmax* amax_D_ptr = nullptr; + ElementAmax* amax_aux_ptr = nullptr; + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux* aux_ptr = nullptr; + StrideAux dAux = {}; + + operator typename Impl::Arguments() const { + // Only compute amax_d if D is fp8 + ElementAmax* amax_D_ptr_ = nullptr; + if constexpr (detail::is_fp8_v) { + amax_D_ptr_ = amax_D_ptr; + } + + // Aux is fp8 -> DAG arguments + if constexpr (detail::is_fp8_v) { + typename Impl::Arguments args; + // always use structured binding to unpack DAG args since it may or may not be a tuple + auto& [Z_args, aux_args, D_args] = args; + + Z_args = + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha, {_0{}, _0{}, 0}, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : 
multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }; // end ternary op + + D_args = + { // binary op : activation(Z) * scale_d or activation(Z) + { // unary op : reduce(activation(Z)) + { // unary op : activation(Z) + {}, // leaf args : Z + activation // unary args : activation + }, // end unary op + {amax_D_ptr_} // unary args : reduce + }, // end unary op + {{scale_d}, + {scale_d_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + + aux_args = + { // unary op : store(Aux) + { // binary op : Z * scale_d or Z + { // unary op : reduce(Z) + {}, // leaf args : Z + {amax_aux_ptr} // unary args : reduce + }, // end unary op + {{scale_aux}, + {scale_aux_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies + }, // end binary op + {aux_ptr, dAux} // unary args : store + }; // end unary op + + return args; + } + + // Aux is not fp8 -> Tree arguments + else { + return + { // binary op : activation(Z) * scale_d or activation(Z) + { // unary op : reduce(activation(Z)) + { // unary op : activation(Z) + { // unary op : store(Z) + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha, {_0{}, _0{}, 0}, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias + }, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, dAux} // unary args : store + }, // end unary op + activation // unary args : activation + }, // end unary op + {amax_D_ptr_} // unary args : reduce + }, // end unary op + {{scale_d},{scale_d_ptr}}, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + } + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class CtaTileShapeMNK, + class EpilogueTile, + int Stages, + class StrideAux, + class SmemLayoutAtom, + class CopyOpS2R, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombDeEltAct = + Sm90EVT, // activation(beta * C + (alpha * acc), aux) + Sm90LinearCombination, // beta * C + (alpha * acc) + Sm90AuxLoad // aux + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementSource, + class ElementScalar, + int AlignmentAux, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpS2R +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombDeEltAct< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpS2R +> : 
Sm90LinCombDeEltAct< + CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpS2R, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle + > { + + using Impl = + Sm90LinCombDeEltAct< + CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpS2R, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle + >; + using Operation = + fusion::LinCombDeEltAct< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux const* aux_ptr = nullptr; + StrideAux dAux = {}; + + operator typename Impl::Arguments() const { + return + { // binary op : activation(beta * C + (alpha * acc), aux) + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, ElementAux(0), dAux}, // leaf args : aux + activation // binary args : activation + }; // end binary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class CtaTileShapeMNK, + class EpilogueTile, + int Stages, + class StrideAux, + class SmemLayoutAtom, + class CopyOpS2R, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombDeEltActDePerRowBias = + Sm90EVT, // Identity for final conversion + Sm90EVT, AlignmentBias>, + Sm90LinCombDeEltAct + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentAux, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpS2R +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombDeEltActDePerRowBias< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpS2R +> : Sm90LinCombDeEltActDePerRowBias< + CtaTileShapeMNK, EpilogueTile, 
StagesC, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpS2R, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90LinCombDeEltActDePerRowBias< + CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpS2R, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + using Operation = + fusion::LinCombDeEltActDePerRowBias< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux const* aux_ptr = nullptr; + StrideAux dAux = {}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias* dbias_ptr = nullptr; + StrideBias dDbias = {}; + + operator typename Impl::Arguments() const { + return + { // unary op : identity/convert + { // unary op : reduce(activation(beta * C + (alpha * acc), aux)) + { // binary op : activation(beta * C + (alpha * acc), aux) + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, ElementAux(0), dAux}, // leaf args : aux + activation // binary args : activation + }, // end binary op + {dbias_ptr, ElementCompute(0), dDbias} // unary args : reduce + }, // end unary op + {} // unary args : identity/convert + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = per-column alpha * per-row alpha * acc + beta * c +template< + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentScalar = 128 / sizeof_bits_v, // Alignment of per-column and per-row scaling vectors + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90OuterProdLinComb = + Sm90EVT, // c(beta) * c(C) + c(alpha * acc) + Sm90ScalarBroadcast>, // beta + Sm90SrcFetch, // C + Sm90EVT, // c(alpha) * c(acc) + Sm90OuterProduct<0, CtaTileShapeMNK, ElementScalar, Stride<_1,_0,int>, Stride<_0,_1,int>, AlignmentScalar>, // alpha_col * alpha_row + Sm90AccFetch // acc + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + int AlignmentScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + OuterProdLinComb, + CtaTileShapeMNK, + 
EpilogueTile +> : Sm90OuterProdLinComb { + using Impl = Sm90OuterProdLinComb; + using Operation = OuterProdLinComb; + + struct Arguments { + + // Give a name and flat ordering to the fusion callback args + using StrideCol = Stride<_1,_0,int>; + using StrideRow = Stride<_0,_1,int>; + using StrideBeta = Stride<_0,_0,int>; + ElementScalar const* alpha_ptr_col = nullptr; + ElementScalar const* alpha_ptr_row = nullptr; + ElementScalar beta = static_cast(0); + ElementScalar const* beta_ptr = nullptr; + StrideCol dAlphaCol = {}; + StrideRow dAlphaRow = {}; + StrideBeta dBeta = {}; + + // Conversion to the args expected by the visitor implementation + // to_underlying_arguments will implicitly call this + operator typename Impl::Arguments() const { + return + { + {beta, beta_ptr, dBeta}, // leaf args : beta + {}, // leaf args : C + { + { alpha_ptr_col, alpha_ptr_row, dAlphaCol, dAlphaRow }, // leaf args : alpha cols / rows + {}, // leaf args : acc + {} + }, + {} + }; + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = softmax(top_k(alpha * acc + beta * C)) +template< + int TopK, + int FragmentSize, + class CtaTileShapeMNK, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombTopKSoftmaxCol = + Sm90EVT, // softmax(top_k(beta * C + (alpha * acc))) + Sm90LinearCombination // beta * C + (alpha * acc) + >; + +template < + int TopK, + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombTopKSoftmaxCol, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombTopKSoftmaxCol { + + using Impl = Sm90LinCombTopKSoftmaxCol::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinCombTopKSoftmaxCol; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + operator typename Impl::Arguments() const { + return + { // unary op: activation(beta * C + (alpha * acc)) + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, // end ternary op + {} // unary args: activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { +template > +struct get_element_aux { + using type = void; +}; + +template +struct get_element_aux> { + using type = typename FusionOpOrCallbacks::ElementAux; +}; + +template +struct get_element_aux, cute::void_t<>> { + using type = typename get_element_aux::type; +}; + +template +struct get_element_aux, cute::void_t::Operation>> { + private: + using Operation = typename FusionCallbacks::Operation; + public: + using type = typename get_element_aux::type; +}; +} 
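// The get_element_aux_t alias defined just below is a convenience for callers (for
// example, epilogue collective builders) that need the auxiliary element type of a fusion
// operation, given either the operation itself or its FusionCallbacks specialization.
// Illustrative use only; MyCallbacks is a placeholder name:
//
//   using ElementAux = get_element_aux_t<MyCallbacks>;  // the op's ElementAux, or void
//                                                       // if the op does not define one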
// namespace cutlass::epilogue::fusion::detail + +template <class FusionOpOrCallbacks> +using get_element_aux_t = typename detail::get_element_aux<FusionOpOrCallbacks>::type; + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp new file mode 100644 index 0000000000..321daa6bcc --- /dev/null +++ b/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp @@ -0,0 +1,841 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Visitor tree compute operations for the sm90 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/epilogue/thread/activation.h" + +#include "cute/tensor.hpp" + +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// N-nary Elementwise Compute Operation +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// The template argument provided for ComputeFn must be able to accept +// exactly one template parameter. In Standard C++, it's OK for +// ComputeFn to have other template parameters, as long as those have +// defaults. For example, the following struct Foo would work. +// +// template +// struct Foo { +// CUTLASS_HOST_DEVICE auto operator() (A a, B b); +// }; +// +// However, some compilers, such as Clang, require that the argument +// take _exactly_ one template parameter. This is nonstandard C++ +// behavior. One work-around for this case is to create a subclass +// with exactly one template parameter, and then use that subclass as +// the template argument. +// +// template +// struct FooHomogeneous : public Foo {}; +// +template< + template class ComputeFn, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + class = void +> +struct Sm90Compute { +private: + using EmptyArguments = typename Sm90VisitorImpl<>::Arguments; + + template + struct ComputeArguments { + using type = EmptyArguments; + }; + + // partial specialization for compute fns that define an Arguments member, e.g. 
activation hyperparameters + template + struct ComputeArguments> { + using type = typename Fn::Arguments; + }; + +public: + struct SharedStorage { }; + + using Arguments = typename ComputeArguments>::type; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const&, Arguments const& args, void*) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const&, Arguments const&) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90Compute() + : params() {} + + CUTLASS_HOST_DEVICE + Sm90Compute(Params const& params, SharedStorage const& shared_storage) + : params(params) {} + + Params const params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(Params const& params) + : params(params) {} + + Params const& params; + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const&... frg_inputs) { + return transform_apply(cute::make_tuple(frg_inputs...), + [&] (auto&& frg_input) { + using ElementInput = typename cute::remove_cvref_t::Element; + using ConvertInput = NumericArrayConverter; + ConvertInput convert_input{}; + + return convert_input(frg_input); + }, + [&] (auto&&... cvt_frg_inputs) { + using ComputeOutput = ComputeFn>; + ComputeOutput compute_output{}; + + if constexpr (cute::is_same_v) { + using ElementComputeOutput = + typename cute::remove_cvref_t::Element; + using ConvertOutput = NumericArrayConverter; + ConvertOutput convert_output{}; + return convert_output(compute_output(cvt_frg_inputs...)); + } + else { + using ElementComputeOutput = + typename cute::remove_cvref_t::Element; + using ConvertOutput = NumericArrayConverter; + ConvertOutput convert_output{}; + return convert_output(compute_output(cvt_frg_inputs..., params)); + } + } + ); + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
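// Restating the ComputeFn requirement from the comment at the top of this file with the
// template brackets spelled out (Foo and FooHomogeneous are the placeholder names used in
// that comment, not real CUTLASS types):
//
//   template <class A, class B = A>
//   struct Foo {
//     CUTLASS_HOST_DEVICE auto operator()(A a, B b);
//   };
//
//   // Work-around for compilers that require the template template argument to accept
//   // exactly one template parameter:
//   template <class T>
//   struct FooHomogeneous : public Foo<T, T> {};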
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + return ConsumerStoreCallbacks(params); + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Performance Optimized Specializations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// beta * C + Z +template < + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + class InputScaleOp, // beta + class ElementSource, // C + class InputAddOp // Z +> +struct Sm90TreeVisitor< + Sm90Compute().is_zero())>>, + InputScaleOp, + Sm90SrcFetch, + InputAddOp +> : Sm90VisitorImpl< + InputScaleOp, + Sm90SrcFetch, + InputAddOp, + Sm90Compute + > +{ + using Impl = + Sm90VisitorImpl< + InputScaleOp, + Sm90SrcFetch, + InputAddOp, + Sm90Compute + >; + using Params = typename Impl::Params; + using SharedStorage = typename Impl::SharedStorage; + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor() {} + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor( + Params const& params, + SharedStorage const& shared_storage) + : Impl(params, shared_storage) {} + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + auto const& scale_op = get<0>(Impl::ops); + auto const& added_op = get<2>(Impl::ops); + if constexpr (detail::IsScalarBroadcast::value && not is_void_v) { + return (get<2>(scale_op.params_ptr->dScalar[0]) != 0 && scale_op.params_ptr->scalar_ptrs[0] != nullptr) || + is_C_load_needed() || + added_op.is_producer_load_needed(); + } + else { + return is_C_load_needed() || added_op.is_producer_load_needed(); + } + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + auto const& scale_op = get<0>(Impl::ops); + auto const& src_op = get<1>(Impl::ops); + auto const& added_op = get<2>(Impl::ops); + return (not scale_op.is_zero() && src_op.is_C_load_needed()) || added_op.is_C_load_needed(); + } + + template + struct ConsumerStoreCallbacks : CallbacksImpl { + CUTLASS_DEVICE + ConsumerStoreCallbacks(bool is_C_load_needed, CallbacksImpl&& impl) + : is_C_load_needed(is_C_load_needed), CallbacksImpl(cute::forward(impl)) { } + + bool is_C_load_needed; + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_added = get<2>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); + + using ElementZ = typename decltype(frg_added)::Element; + using ConvertZ = NumericArrayConverter; + using ConvertI = NumericArrayConverter; + ConvertZ convert_Z{}; + ConvertI convert_I{}; + + Array frg_I = convert_Z(frg_added); + + if constexpr (!is_void_v) { + Array frg_scalar = get<0>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); + Array frg_source = get<1>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); + + using ElementX = typename decltype(frg_scalar)::Element; + using ElementY = typename decltype(frg_source)::Element; + using ConvertX = NumericArrayConverter; + using ConvertY = NumericArrayConverter; + using ComputeI = multiply_add>; + ConvertX convert_X{}; + ConvertY convert_Y{}; + ComputeI compute_I{}; + + frg_I = compute_I(convert_X(frg_scalar), convert_Y(frg_source), frg_I); + } + + return convert_I(frg_I); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
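// Note on the specialization above: when the added term does not itself consume C and the
// beta scaling is known to be zero (or ElementSource is void), is_C_load_needed() returns
// false, so the consumer-store path clears the C register fragment instead of loading it.
// Skipping that load is the point of this performance-optimized tree.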
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto callbacks_tuple = Impl::template get_consumer_store_callbacks(args); + bool is_C_load_needed = this->is_C_load_needed(); + if (not is_C_load_needed) { + cute::clear(args.tCrC); + } + return ConsumerStoreCallbacks( + is_C_load_needed, std::move(callbacks_tuple)); + } +}; + +// ReLU with aux bit tensor dReLU/dZ +// Aux(i) = Z(i) >= 0 ? 1 : 0 +namespace detail { +// Placeholder node so we can retain standard EVT structure +template +struct Sm90ReLUAuxStore : Sm90VisitorImpl<> { + struct SharedStorage {}; + + struct Arguments { + cutlass::uint1b_t* ptr_aux = nullptr; + StrideMNL dAux = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90ReLUAuxStore() { } + + CUTLASS_HOST_DEVICE + Sm90ReLUAuxStore(Params const& params, SharedStorage const& shared_storage) { } +}; +} // namespace detail + +// Specialization on the generic compute+aux EVT +template < + // Compute node + template class Activation, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + // Aux node + int Stages, + class EpilogueTile, + class StrideMNL, + class SmemLayoutAtom, + class CopyOpR2S, + int Alignment, + bool EnableNullptr, + // Input node + class InputOp +> +struct Sm90TreeVisitor< + Sm90Compute, cutlass::epilogue::thread::ReLu> || + cute::is_same_v, cutlass::epilogue::thread::Clamp> || + cute::is_same_v, cutlass::epilogue::thread::ThresholdReLU> >>, + Sm90TreeVisitor< + Sm90AuxStore< + Stages, + EpilogueTile, + cutlass::uint1b_t, + RoundStyle, + StrideMNL, + SmemLayoutAtom, + CopyOpR2S, + Alignment, + EnableNullptr + >, + InputOp + > +> : Sm90VisitorImpl< + Sm90VisitorImpl< + InputOp, + detail::Sm90ReLUAuxStore + >, + Sm90Compute + > +{ + using Impl = + Sm90VisitorImpl< + Sm90VisitorImpl< + InputOp, + detail::Sm90ReLUAuxStore + >, + Sm90Compute + >; + using Params = typename Impl::Params; + using SharedStorage = typename Impl::SharedStorage; + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor() {} + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor(Params const& params_, SharedStorage const& shared_storage) + : params(params_), Impl(params_, shared_storage) {} + + Params const& params; + + template + struct ConsumerStoreCallbacks : CallbacksImpl { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + RTensor&& tC_rAux, + GTensor&& tC_gAux, + CTensor tC_cAux, + ThrResidue residue_tC_cAux, + Params const& params, + CallbacksImpl&& impl) + : tC_rAux(cute::forward(tC_rAux)), + tC_gAux(cute::forward(tC_gAux)), + tC_cAux(tC_cAux), + residue_tC_cAux(residue_tC_cAux), + params(params), + CallbacksImpl(cute::forward(impl)) {} + + RTensor tC_rAux; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + GTensor tC_gAux; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + CTensor tC_cAux; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tC_cAux; + Params const& params; + + template + CUTLASS_DEVICE Array + 
visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + // Unpack callbacks + params + auto& [callbacks_input_aux, callbacks_compute] = CallbacksImpl::callbacks_tuple; + auto& [callbacks_input, callbacks_aux] = callbacks_input_aux.callbacks_tuple; + auto const& [params_input_aux, params_compute] = params; + auto const& [params_input, params_aux] = params_input_aux; + + // Visit the input node + Array frg_input = callbacks_input.visit(frg_acc, epi_v, epi_m, epi_n); + + // Compute activation + aux + using ElementInput = typename decltype(frg_input)::Element; + using ConvertInput = NumericArrayConverter; + using ConvertAux = PackPredicates; + using ComputeOutput = Activation; + using ConvertOutput = NumericArrayConverter; + ConvertInput convert_input{}; + ComputeOutput relu{}; + ConvertAux convert_aux{}; + ConvertOutput convert_output{}; + + Array frg_compute = convert_input(frg_input); + bool frg_aux[FragmentSize]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + ElementCompute pre_relu = frg_compute[i]; + if constexpr (cute::is_same_v, cutlass::epilogue::thread::Clamp> || + cute::is_same_v, cutlass::epilogue::thread::ThresholdReLU>) { + frg_compute[i] = relu(frg_compute[i], params_compute); + } + else { + frg_compute[i] = relu(frg_compute[i]); + } + if constexpr (cute::is_same_v) { + uint32_t aux; + asm volatile("set.equ.u32.f32 %0, %1, %2;\n" : "=r"(aux) : "f"(frg_compute[i]), "f"(pre_relu)); // NaN outputs 1 in Aux + frg_aux[i] = static_cast(aux); + } else if constexpr (cute::is_same_v) { + uint32_t aux; + cutlass::half_t compute = frg_compute[i]; + asm volatile("set.equ.u32.f16 %0, %1, %2;\n" : "=r"(aux) : "h"(compute.raw()), "h"(pre_relu.raw())); // NaN outputs 1 in Aux + frg_aux[i] = static_cast(aux); + } else { + frg_aux[i] = frg_compute[i] == pre_relu; + } + } + + static_assert(FragmentSize % 8 == 0, "Predicate vector must be byte-aligned"); + Tensor tC_rAux_frg = recast(coalesce(tC_rAux(_,_,_,epi_m,epi_n))); // (EPI_V) + tC_rAux_frg(epi_v) = convert_aux(frg_aux); + + return convert_output(frg_compute); + } + + CUTLASS_DEVICE void + end() { + // Unpack callbacks + params + auto& [callbacks_input_aux, callbacks_compute] = CallbacksImpl::callbacks_tuple; + auto& [callbacks_input, callbacks_aux] = callbacks_input_aux.callbacks_tuple; + auto const& [params_input_aux, params_compute] = params; + auto const& [params_input, params_aux] = params_input_aux; + + // Visit the input node + callbacks_input.end(); + + // Nullptr is no-op + if constexpr (EnableNullptr) { + if (params_aux.ptr_aux == nullptr) { + return; + } + } + + // Compute vectorization + constexpr auto MCL = decltype(max_common_layout(tC_rAux, tC_gAux)){}; + constexpr int V = cute::min(Alignment, size(MCL)); + // Copy vectorizes into byte-aligned stores + if constexpr (V > 1 && V % 8 == 0) { + using VecType = uint_bit_t; + Tensor tC_rAux_vec = recast(tC_rAux); + Tensor tC_gAux_vec = recast(tC_gAux); + Tensor tC_cAux_vec = tensor<1>(zipped_divide(tC_cAux, MCL.compose(Int{}))); + auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux_vec(coords...), residue_tC_cAux); }; + copy_if(predicate_fn, tC_rAux_vec, tC_gAux_vec); + } + // sub-byte vectorization, must serialize threads + else { + // Assumes no inter-warp sharing of bytes (most copy layouts should satisfy this) + int lane_idx = canonical_lane_idx(); + auto predicate_fn = [&] (auto&&... 
coords) { return elem_less(tC_cAux(coords...), residue_tC_cAux); }; + CUTLASS_PRAGMA_NO_UNROLL + for (int i = 0; i < NumThreadsPerWarp; ++i) { + if (lane_idx == i) { + copy_if(predicate_fn, tC_rAux, tC_gAux); + } + __syncwarp(); + } + } + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + // Unpack params + auto const& [params_input_aux, params_compute] = params; + auto const& [params_input, params_aux] = params_input_aux; + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + gmem_ptr ptr_aux = make_gmem_ptr(subbyte_iterator(params_aux.ptr_aux)); + Tensor mAux = make_tensor(ptr_aux, make_layout(make_shape(M,N,L), params_aux.dAux)); // (M,N,L) + Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) + + Tensor tC_gAux = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + gAux, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tC_rAux = make_tensor(shape(tC_gAux)); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + auto callbacks_impl = Impl::template get_consumer_store_callbacks(args); + return ConsumerStoreCallbacks( + cute::move(tC_rAux), cute::move(tC_gAux), args.tCcD, args.residue_tCcD, params, cute::move(callbacks_impl)); + } +}; + +// Aux load for uint1b_t +template < + int Stages, + class EpilogueTile, + class StrideMNL, + class SmemLayoutAtom, + class CopyOpS2R, + int Alignment, + bool EnableNullptr +> +struct Sm90AuxLoad< + Stages, + EpilogueTile, + cutlass::uint1b_t, + StrideMNL, + SmemLayoutAtom, + CopyOpS2R, + Alignment, + EnableNullptr +> { + static_assert(Alignment % 128 == 0, "sub-16B alignment not supported yet"); + + struct SharedStorage {}; + + struct Arguments { + cutlass::uint1b_t const* ptr_aux = nullptr; + cutlass::uint1b_t null_default = cutlass::uint1b_t(0); + StrideMNL dAux = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad() { } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad(Params const& params, SharedStorage const&) + : params(params) { } + + Params const params; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(RTensor&& tC_rAux_, GTensor&& tC_gAux_, CTensor tC_cAux_, ThrResidue residue_tC_cAux_, Params const& params_) + : tC_rAux(cute::forward(tC_rAux_)), + tC_gAux(cute::forward(tC_gAux_)), + tC_cAux(tC_cAux_), + residue_tC_cAux(residue_tC_cAux_), + params(params_) {} + + RTensor tC_rAux; 
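// Sizing reminder for the bit-packed ReLU mask handled here (numbers are illustrative):
// each Aux element is a single uint1b_t, so a fragment of FragmentSize = 8 predicates
// packs into one byte (hence the FragmentSize % 8 == 0 assertion in the store path above),
// and a 128-element aligned access moves 128 bits = 16 B at a time, which is why this
// load specialization asserts Alignment % 128 == 0 ("sub-16B alignment not supported").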
// (CPY,CPY_M,CPY_N,{EPI_M,EPI_N}) + GTensor tC_gAux; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + CTensor tC_cAux; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tC_cAux; + Params const& params; + + CUTLASS_DEVICE void + begin() { + if constexpr (decltype(cute::rank(tC_rAux))::value == 5) { + if constexpr (EnableNullptr) { + if (params.ptr_aux == nullptr) { + return; + } + } + + constexpr auto MCL = decltype(max_common_layout(tC_rAux, tC_gAux)){}; + constexpr int V = cute::min(Alignment, size(MCL)); + if constexpr (V > 1) { + using VecType = uint_bit_t; + Tensor tC_gAux_vec = recast(tC_gAux); + Tensor tC_rAux_vec = recast(tC_rAux); + Tensor tC_cAux_vec = tensor<1>(zipped_divide(tC_cAux, MCL.compose(Int{}))); + auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux_vec(coords...), residue_tC_cAux); }; + copy_if(predicate_fn, tC_gAux_vec, tC_rAux_vec); + } + else { + auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux(coords...), residue_tC_cAux); }; + copy_if(predicate_fn, tC_gAux, tC_rAux); + } + } + } + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if constexpr (decltype(cute::rank(tC_rAux))::value == 3) { + if constexpr (EnableNullptr) { + if (params.ptr_aux == nullptr) { + return; + } + } + + auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux(_,_,_,epi_m,epi_n)(coords...), residue_tC_cAux); }; + copy_if(predicate_fn, tC_gAux(_,_,_,epi_m,epi_n), tC_rAux); + } + } + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + using ElementRegister = typename remove_cvref_t::value_type; + if constexpr (decltype(cute::rank(tC_rAux))::value == 3) { + return recast>(coalesce(tC_rAux))(epi_v); + } + else { + return recast>(coalesce(tC_rAux(_,_,_,epi_m,epi_n)))(epi_v); + } + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + gmem_ptr ptr_aux = make_gmem_ptr(subbyte_iterator(params.ptr_aux)); + Tensor mAux = make_tensor(ptr_aux, make_layout(make_shape(M,N,L), params.dAux)); // (M,N,L) + Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) + + Tensor tC_gAux = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + gAux, args.epi_tile, args.tiled_copy, args.thread_idx); + + // If byte-unaligned vectorization, store in registers as uint32_t to reduce redundant pack+unpack instruction sequences + constexpr int V = decltype(max_common_vector(tC_gAux.layout(), make_layout(tC_gAux.shape())))::value; + Tensor tC_rAux = [&] () { + if constexpr (V % 8 != 0) { + return make_tensor(take<0,3>(shape(tC_gAux))); // (CPY,CPY_M,CPY_N) + } else { + return make_tensor(shape(tC_gAux)); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + } + }(); + + if constexpr (EnableNullptr) { + if (params.ptr_aux == nullptr) { + fill(tC_rAux, params.null_default); + } + } + + return ConsumerStoreCallbacks( + cute::move(tC_rAux), cute::move(tC_gAux), args.tCcD, args.residue_tCcD, params); + } +}; + +// dReLU specialization +template< + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle +> +struct Sm90Compute< + cutlass::epilogue::thread::dReLU, + ElementOutput, + ElementCompute, + RoundStyle +> : Sm90VisitorImpl<> { + + using Sm90VisitorImpl<>::Sm90VisitorImpl; + + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input, + Array const& frg_aux) { + using ConvertInput = NumericArrayConverter; + using ComputeOutput = cutlass::epilogue::thread::dReLU>; + using ConvertOutput = NumericArrayConverter; + ConvertInput convert_input{}; + ComputeOutput compute_output{}; + ConvertOutput convert_output{}; + + return convert_output(compute_output(convert_input(frg_input), frg_aux)); // don't convert frg_aux for dReLU + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + return ConsumerStoreCallbacks(); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp new file mode 100644 index 0000000000..66b1086efc --- /dev/null +++ b/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp @@ -0,0 +1,1581 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Visitor tree load operations for the sm90 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" +#include "cutlass/epilogue/collective/detail.hpp" + +#include "cute/tensor.hpp" +#include "sm90_visitor_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Elementwise Fetch Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// returns accumulator +struct Sm90AccFetch : Sm90VisitorImpl<> { + + using Sm90VisitorImpl<>::Sm90VisitorImpl; + + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + return frg_acc; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + return ConsumerStoreCallbacks{}; + } +}; + +// Split tree visitor fetches intermediate results from temporary accumulators +using Sm90SplitTreeFetch = Sm90AccFetch; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// returns C +template +struct Sm90SrcFetch : Sm90VisitorImpl<> { + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return is_C_load_needed(); + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return not is_void_v; + } + + CUTLASS_DEVICE bool + is_zero() const { + return is_void_v; + } + + using Sm90VisitorImpl<>::Sm90VisitorImpl; + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(SrcTensor const& tCrC) + : tCrC(tCrC) {} + + SrcTensor const& tCrC; // (CPY,CPY_M,CPY_N) + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + return recast>(tCrC)(epi_v); + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + // register type may differ from logical type so we can't assert matching types here + return ConsumerStoreCallbacks(args.tCrC); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Elementwise Load Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int Stages, + class EpilogueTile, + class Element, + class StrideMNL, + class SmemLayoutAtom, + class CopyOpS2R, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true // Fallback scalar broadcast for nullptr params +> +struct Sm90AuxLoad { + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + + constexpr static bool is_m_major = epilogue::collective::detail::is_m_major(); + // Find the max contiguous layout usable by TMA (if EpilogueTile is a non-compact tiler) + using SmemShapeTma = decltype(make_shape( + max_common_vector(make_layout(get<0>(EpilogueTile{})),make_layout(get<0>(EpilogueTile{}))), + max_common_vector(make_layout(get<1>(EpilogueTile{})),make_layout(get<1>(EpilogueTile{}))))); + using SmemLayoutTma = decltype(tile_to_shape( + SmemLayoutAtom{}, SmemShapeTma{}, + cute::conditional_t, Step<_1,_2>>{} )); + using SmemLayout = decltype(tile_to_shape( + SmemLayoutTma{}, + make_shape(size<0>(shape(EpilogueTile{})), size<1>(shape(EpilogueTile{})), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + using CopyOpG2S = + SM90_TMA_LOAD + ; + + struct SharedStorage { + alignas(cutlass::detail::alignment_for_swizzle(SmemLayout{})) + array_aligned smem_aux; + }; + + struct Arguments { + Element const* ptr_aux = nullptr; + Element null_default = Element(0); + StrideMNL dAux = {}; + }; + + struct Params { + using TMA_Aux = decltype(make_tma_copy( + CopyOpG2S{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), repeat_like(StrideMNL{}, int32_t(0)), append<3>(StrideMNL{}, _0{})), + take<0,2>(SmemLayoutTma{}))); + TMA_Aux tma_load_aux; + Element null_default = Element(0); + bool use_default = false; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + // Optionally append 1s until problem shape 
is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto M_AUX = + size(M) + ; + Tensor tensor_aux = make_tensor(make_gmem_ptr(args.ptr_aux), make_layout(make_shape(M_AUX,N,L), append<3>(args.dAux, _0{}))); + typename Params::TMA_Aux tma_load_aux = make_tma_copy(CopyOpG2S{}, tensor_aux, take<0,2>(SmemLayoutTma{})); + + bool use_default = false; + if constexpr (EnableNullptr) { + use_default = args.ptr_aux == nullptr; + } + + return Params{tma_load_aux, args.null_default, use_default}; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad() { } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms), + smem_aux(const_cast(shared_storage.smem_aux.data())) { } + + Params const* params_ptr; + Element* smem_aux; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return true; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (params_ptr->use_default && params_ptr->null_default == Element(0)); + } + + template + struct ProducerLoadCallbacks : EmptyProducerLoadCallbacks { + CUTLASS_DEVICE + ProducerLoadCallbacks(GTensor&& bGS_gAux, STensor&& bGS_sAux, Params const* params_ptr) + : bGS_gAux(cute::forward(bGS_gAux)), + bGS_sAux(cute::forward(bGS_sAux)), + params_ptr(params_ptr) {} + + GTensor bGS_gAux; // (TMA,TMA_M,TMA_N,EPI_M,EPI_N) + STensor bGS_sAux; // (TMA,TMA_M,TMA_N,PIPE) + Params const* params_ptr; + + CUTLASS_DEVICE void + step(uint64_t* full_mbarrier_ptr, int epi_m, int epi_n, int load_iteration, bool issue_tma_load) { + if constexpr (EnableNullptr) { + if (params_ptr->use_default) { + return; + } + } + + if (issue_tma_load) { + // Increment the expected transaction bytes of the current stage's mbarrier by the subtile's byte-size + constexpr uint32_t copy_bytes = size(take<0,2>(SmemLayout{})) * sizeof_bits_v / 8; + cutlass::arch::ClusterTransactionBarrier::expect_transaction(full_mbarrier_ptr, copy_bytes); + // Issue the TMA load + constexpr uint16_t mcast_mask = 0; + int load_pipe_index = load_iteration % Stages; + copy(params_ptr->tma_load_aux.with(*full_mbarrier_ptr, mcast_mask), + bGS_gAux(_,_,_,epi_m,epi_n), bGS_sAux(_,_,_,load_pipe_index)); + } + } + }; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + auto coord_shape = + make_coord(m, n, l) + ; + Tensor mAux_mn = params_ptr->tma_load_aux.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor mAux = coalesce(mAux_mn, take<0,2>(args.tile_shape_mnk)); + Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), coord_shape); // (CTA_M,CTA_N) + + Tensor gAux_epi = flat_divide(gAux, args.epi_tile); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor sAux_epi = make_tensor(make_smem_ptr(smem_aux), SmemLayout{}); // (EPI_TILE_M,EPI_TILE_N,PIPE) + + ThrCopy thrblk_g2s = 
params_ptr->tma_load_aux.get_slice(_0{}); + Tensor bGS_gAux = thrblk_g2s.partition_S(gAux_epi); // (TMA,TMA_M,TMA_N,EPI_M,EPI_N) + Tensor bGS_sAux = thrblk_g2s.partition_D(sAux_epi); // (TMA,TMA_M,TMA_N,PIPE) + + return ProducerLoadCallbacks( + cute::move(bGS_gAux), cute::move(bGS_sAux), params_ptr); + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(RTensor&& tC_rAux, TiledS2R tiled_s2r, STensorS2R&& tSR_sAux, Params const* params_ptr) + : tC_rAux(cute::forward(tC_rAux)), + tiled_s2r(tiled_s2r), + tSR_sAux(cute::forward(tSR_sAux)), + params_ptr(params_ptr) { } + + TiledS2R tiled_s2r; + RTensor tC_rAux; // (CPY,CPY_M,CPY_N) + STensorS2R tSR_sAux; // (S2R,S2R_M,S2R_N,PIPE) + Params const* params_ptr; + + CUTLASS_DEVICE void + previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) { + if constexpr (EnableNullptr) { + if (params_ptr->use_default) { + fill(tC_rAux, params_ptr->null_default); + return; + } + } + + using RLayoutS2R = decltype(cute::layout(TiledS2R{}.get_slice(0).retile_S(RTensor{}))); + Tensor tSR_rAux = make_tensor(tC_rAux.data(), RLayoutS2R{}); // (S2R,S2R_M,S2R_N) + + int load_pipe_index = load_iteration % Stages; + copy(tiled_s2r, tSR_sAux(_,_,_,load_pipe_index), tSR_rAux); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Tensor tC_rAux_frg = recast>(coalesce(tC_rAux)); // (EPI_V) + + return tC_rAux_frg(epi_v); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + + Tensor mAux_mn = params_ptr->tma_load_aux.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor mAux = coalesce(mAux_mn, take<0,2>(args.tile_shape_mnk)); + Tensor tC_gAux = sm90_partition_for_epilogue(mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tC_rAux = make_tensor(take<0,3>(shape(tC_gAux))); // (CPY,CPY_M,CPY_N) + + auto tiled_s2r = conditional_return( + make_tiled_copy_S(Copy_Atom{}, args.tiled_copy), + make_tiled_copy_D(Copy_Atom{}, args.tiled_copy) + ); + Tensor sAux_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(smem_aux), SmemLayout{})); // (EPI_TILE_M,EPI_TILE_N,PIPE) + auto tSR_sAux = tiled_s2r.get_slice(args.thread_idx).partition_S(sAux_epi); // (S2R,S2R_M,S2R_N,PIPE) + + return ConsumerStoreCallbacks( + cute::move(tC_rAux), tiled_s2r, cute::move(tSR_sAux), params_ptr); + } +}; + +template < + class Element, + class EpilogueTile, // Unused + class LayoutOrStrideMNL, + class SmemLayoutAtom, // Unused + class CopyOpS2R, // Unused + int Alignment, + bool EnableNullptr +> +struct Sm90AuxLoad< + 0, EpilogueTile, Element, LayoutOrStrideMNL, + SmemLayoutAtom, CopyOpS2R, Alignment, EnableNullptr +> { + using ElementAux = Element; + using StrideMNL = cutlass::gemm::TagToStrideC_t; + + struct SharedStorage { }; + + struct Arguments { + Element const* ptr_aux = nullptr; + Element null_default = Element(0); + StrideMNL dAux = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + 
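// Illustrative sketch (editor's addition, not part of this patch): a minimal
// host-side analogue of the EnableNullptr fallback used by Sm90AuxLoad above.
// When ptr_aux is null, every visited fragment is filled with null_default
// instead of being staged through TMA and shared memory. The function name and
// signature below are hypothetical.
#include <array>
#include <cstddef>

template <class Element, int FragmentSize>
std::array<Element, FragmentSize>
load_aux_fragment(Element const* ptr_aux, Element null_default, std::size_t offset) {
  std::array<Element, FragmentSize> frg{};
  if (ptr_aux == nullptr) {              // EnableNullptr fallback path
    frg.fill(null_default);
    return frg;
  }
  for (int i = 0; i < FragmentSize; ++i) {
    frg[i] = ptr_aux[offset + i];        // the device path stages via TMA + S2R copies instead
  }
  return frg;
}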
template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad() { } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { } + + Params const* params_ptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template< + class GTensorG2R, + class RTensor, + class CTensorG2R, + class ProblemShapeMNL + > + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(GTensorG2R&& tC_gAux, + RTensor&& tC_rAux, + CTensorG2R&& tC_cAux, + ProblemShapeMNL problem_shape_mnl, + Params const* params_ptr) + : tC_gAux(cute::forward(tC_gAux)), + tC_rAux(cute::forward(tC_rAux)), + tC_cAux(cute::forward(tC_cAux)), + problem_shape_mnl(problem_shape_mnl), + params_ptr(params_ptr) {} + + GTensorG2R tC_gAux; + RTensor tC_rAux; + CTensorG2R tC_cAux; + ProblemShapeMNL problem_shape_mnl; + Params const* params_ptr; + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if constexpr (EnableNullptr) { + if (params_ptr->ptr_aux == nullptr) { + fill(tC_rAux, params_ptr->null_default); + return; + } + } + constexpr auto MCL = decltype(max_common_layout(tC_gAux(_,_,_,_0{},_0{}), tC_rAux)){}; + constexpr int V = cute::min(Alignment, size(MCL)); + + Tensor tC_cAux_mn = tC_cAux(_,_,_,epi_m,epi_n); + Tensor tC_cAux_vec = tensor<1>(zipped_divide(coalesce(tC_cAux_mn), MCL.compose(Int{}))); + + Tensor tC_gAux_vec = recast>(coalesce(tC_gAux(_,_,_,epi_m,epi_n))); + Tensor tC_rAux_vec = recast>(coalesce(tC_rAux)); + + auto pred_fn = [&] (auto const&... coords) { + return elem_less(tC_cAux_vec(coords...), problem_shape_mnl); + }; + + copy_if(pred_fn, tC_gAux_vec, tC_rAux_vec); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + return recast>(tC_rAux)(epi_v); + } + }; + + template < + bool ReferenceSrc, + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + auto problem_shape_mnl = make_shape(M,N,L); + + // Gmem Tensor + Tensor mAux = make_tensor( + make_gmem_ptr(params_ptr->ptr_aux), make_shape(M,N,L), params_ptr->dAux + ); + Tensor tC_gAux = sm90_partition_for_epilogue( + mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + // Register Tensor + Tensor tC_rAux = make_tensor(take<0,3>(shape(tC_gAux))); + + // Predication support + Tensor coordAux = make_identity_tensor(shape(mAux)); + Tensor tC_cAux = sm90_partition_for_epilogue( + coordAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tC_gAux), + cute::move(tC_rAux), + cute::move(tC_cAux), + problem_shape_mnl, + params_ptr + ); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Broadcast Load Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Scalar broadcast +// Supports reduction over multiple broadcasts to support fusions such as fp8 scaling factors +template< + class Element, + class StrideMNL_ = Stride<_0,_0,_0>, + int BroadcastCount = 1, + template class ReductionFn = multiplies +> +struct Sm90ScalarBroadcast { + using StrideMNL = StrideMNL_; + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{}); + + struct SharedStorage { }; + + struct Arguments { + Element scalars[BroadcastCount] = {}; + Element const* scalar_ptrs[BroadcastCount] = {}; + StrideMNL dScalar[BroadcastCount] = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter *cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + // This must be called after update_scalar is called + CUTLASS_DEVICE bool + is_zero() const { + if (get<2>(params_ptr->dScalar[0]) == 0) { + // Only 1 batch + return scalar == Element(0); + } + else { + // multiple batch + if (valid_scalar == false) { + // for stridedBatch kernel, if ptr has a valid address, we need to enable the epi_load warps. + return params_ptr->scalar_ptrs[0] == nullptr; + } + else { + // Check whether each batch is ZERO or not. 
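// Illustrative sketch (editor's addition, not part of this patch): a host-side
// analogue of update_scalar() in the scalar-broadcast nodes. Each of the
// BroadcastCount scalars comes either from its pointer (offset by the batch
// index times its L-stride) or from the immediate value, and the results are
// folded with ReductionFn (multiplication by default, e.g. FP8 scale factors).
// All names below are hypothetical.
#include <cstdint>
#include <functional>

template <class Element, int BroadcastCount, class ReductionFn = std::multiplies<Element>>
Element fold_broadcast_scalars(Element const* const (&scalar_ptrs)[BroadcastCount],
                               Element const (&scalars)[BroadcastCount],
                               std::int64_t const (&stride_L)[BroadcastCount],
                               int l_coord) {
  auto pick = [&](int i) {
    // nullptr fallback: the batch stride is ignored and the immediate value is used
    return scalar_ptrs[i] ? scalar_ptrs[i][l_coord * stride_L[i]] : scalars[i];
  };
  Element scalar = pick(0);
  ReductionFn reduce{};
  for (int i = 1; i < BroadcastCount; ++i) {
    scalar = reduce(scalar, pick(i));
  }
  return scalar;
}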
+ return scalar == Element(0); + } + } + } + + CUTLASS_HOST_DEVICE + Sm90ScalarBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90ScalarBroadcast(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { + // Get the scalar for non-batched broadcast + if (size<2>(params_ptr->dScalar[0]) == 0) { + update_scalar(); + } + } + + Element scalar; + bool valid_scalar = false; + Params const* params_ptr; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + // Get the scalar for batched broadcast + if (size<2>(params_ptr->dScalar[0]) != 0) { + auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl; + update_scalar(l_coord); + } + + return EmptyProducerLoadCallbacks{}; + } + + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(Element scalar) + : scalar(scalar) {} + + Element scalar; + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_scalar; + frg_scalar.fill(scalar); + + return frg_scalar; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + // Get the scalar for batched broadcast + if (get<2>(params_ptr->dScalar[0]) != 0) { + auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl; + update_scalar(l_coord); + } + + return ConsumerStoreCallbacks(scalar); + } + +private: + CUTLASS_DEVICE void + update_scalar(int l_coord = 0) { + valid_scalar = true; + int l_offset = l_coord * size<2>(params_ptr->dScalar[0]); + + if (params_ptr->scalar_ptrs[0] != nullptr) { + scalar = params_ptr->scalar_ptrs[0][l_offset]; + } + else { + // batch stride is ignored for nullptr fallback + scalar = params_ptr->scalars[0]; + } + + // Do reduction over multiple broadcasts if necessary + ReductionFn reduction_fn; + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < BroadcastCount; ++i) { + if (params_ptr->scalar_ptrs[i] != nullptr) { + int rest_l_offset = l_coord * size<2>(params_ptr->dScalar[i]); + scalar = reduction_fn(scalar, params_ptr->scalar_ptrs[i][rest_l_offset]); + } + else { + // batch stride is ignored for nullptr fallback + scalar = reduction_fn(scalar, params_ptr->scalars[i]); + } + } + } + + template + CUTLASS_DEVICE void + update_scalar(cute::tuple) { + // Only support multiple L-modes with fully-broadcast scalar + scalar = params_ptr->scalars[0]; + valid_scalar = true; + } +}; + +// Scalar broadcast +// Supports reduction over multiple broadcasts to support fusions such as fp8 scaling factors +template< + class Element, + class StrideMNL_ = Stride<_0,_0,_0>, + int BroadcastCount = 1, + template class ReductionFn = multiplies +> +struct Sm90ScalarBroadcastPtrArray { + using StrideMNL = StrideMNL_; + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{}); + + struct SharedStorage { }; + + struct Arguments { + Element scalars[BroadcastCount] = {}; + Element const* scalar_ptrs[BroadcastCount] = {}; + Element const* const* scalar_ptr_arrays[BroadcastCount] = {}; + StrideMNL dScalar[BroadcastCount] = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& 
problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter *cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + // producer load is needed if Element is not void + return !cute::is_void_v; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + // This must be called after update_scalar is called + CUTLASS_DEVICE bool + is_zero() const { + return scalar == Element(0); + } + + CUTLASS_HOST_DEVICE + Sm90ScalarBroadcastPtrArray() { } + + CUTLASS_HOST_DEVICE + Sm90ScalarBroadcastPtrArray(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { + // Get the scalar for non-batched broadcast + if (size<2>(params_ptr->dScalar[0]) == 0) { + update_scalar(); + } + } + + Element scalar; + Params const* params_ptr; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + // Get the scalar for batched broadcast + if (size<2>(params_ptr->dScalar[0]) != 0) { + auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl; + update_scalar(l_coord); + } + + return EmptyProducerLoadCallbacks{}; + } + + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(Element scalar) + : scalar(scalar) {} + + Element scalar; + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_scalar; + frg_scalar.fill(scalar); + + return frg_scalar; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + // Get the scalar for batched broadcast + if (get<2>(params_ptr->dScalar[0]) != 0) { + auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl; + update_scalar(l_coord); + } + + return ConsumerStoreCallbacks(scalar); + } + +private: + CUTLASS_DEVICE void + update_scalar(int l_coord = 0) { + int l_offset = l_coord * size<2>(params_ptr->dScalar[0]); + + if (params_ptr->scalar_ptr_arrays[0] != nullptr) { + scalar = *(params_ptr->scalar_ptr_arrays[0][l_offset]); + } + else if (params_ptr->scalar_ptrs[0] != nullptr) { + scalar = params_ptr->scalar_ptrs[0][l_offset]; + } + else { + // batch stride is ignored for nullptr fallback + scalar = params_ptr->scalars[0]; + } + + // Do reduction over multiple broadcasts if necessary + ReductionFn reduction_fn; + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < BroadcastCount; ++i) { + + if (params_ptr->scalar_ptr_arrays[i] != nullptr) { + int rest_l_offset = l_coord * size<2>(params_ptr->dScalar[i]); + scalar = reduction_fn(scalar, *(params_ptr->scalar_ptr_arrays[i][rest_l_offset])); + } + if (params_ptr->scalar_ptrs[i] != nullptr) { + int rest_l_offset = l_coord * size<2>(params_ptr->dScalar[i]); + scalar = reduction_fn(scalar, params_ptr->scalar_ptrs[i][rest_l_offset]); + } + else { + // batch stride is ignored for nullptr fallback + scalar = reduction_fn(scalar, params_ptr->scalars[i]); + } + } + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +[[deprecated("row broadcast only uses 0 stages")]] constexpr int +compute_row_broadcast_stages() { + return ceil_div(StagesC, size<1>(zipped_divide(make_layout(take<0,2>(CtaTileShapeMNK{})), EpilogueTile{}))) + 1; +} + +} + +// Row vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class ElementInput, + class ElementCompute = ElementInput, + class StrideMNL_ = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true // Fallback scalar broadcast for nullptr params +> +struct Sm90RowBroadcast { + using StrideMNL = StrideMNL_; + static_assert(Stages == 0, "Row broadcast doesn't support smem pipelining"); + + static constexpr bool IsDynamicBroadcast = is_same_v(StrideMNL{}))>, bool>; // row vector or scalar broadcast + static_assert(is_static_v(StrideMNL{}))> || IsDynamicBroadcast); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{} || IsDynamicBroadcast); + + struct SharedStorage { + array_aligned(CtaTileShapeMNK{})> smem; + }; + + struct Arguments { + ElementInput const* ptr_row = nullptr; + ElementInput null_default = ElementInput(0); + StrideMNL dRow = {}; + }; + + struct Params { + ElementInput const* ptr_row = nullptr; + ElementCompute null_default = ElementCompute(0); + StrideMNL dRow = {}; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return {args.ptr_row, ElementCompute(args.null_default), args.dRow}; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + 
CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90RowBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90RowBroadcast(Params const& params, SharedStorage const& shared_storage) + : params(params), is_zero_(false), + smem(const_cast(shared_storage.smem.data())) { + auto const& [stride_M, stride_N, stride_L] = params.dRow; + // Nullptr default + if (EnableNullptr && params.ptr_row == nullptr) { + is_zero_ = params.null_default == ElementCompute(0); + } + // Dynamic non-batched scalar broadcast + else if (IsDynamicBroadcast && stride_N == bool(0) && stride_L == repeat_like(stride_L, 0)) { + is_zero_ = params.ptr_row[0] == ElementInput(0); + } + } + + Params params; + bool is_zero_ = false; + ElementInput *smem = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return is_zero_; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, + GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, + SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_, + Residue residue_cRow_, ThrNum thr_num_, Params const& params_) + : tGS_gRow(tGS_gRow_) + , tGS_sRow(tGS_sRow_) + , tGS_cRow(tGS_cRow_) + , tiled_G2S(tiled_g2s_) + , tSR_sRow(tSR_sRow_) + , tSR_rRow(tSR_rRow_) + , residue_cRow(residue_cRow_) + , params(params_) + , is_nullptr(EnableNullptr && params_.ptr_row == nullptr) { + if (is_nullptr) { + fill(tSR_rRow, params.null_default); + } + } + + GS_GTensor tGS_gRow; // (CPY,CPY_M,CPY_N) + GS_STensor tGS_sRow; // (CPY,CPY_M,CPY_N) + GS_CTensor tGS_cRow; // (CPY,CPY_M,CPY_N) + Tiled_G2S tiled_G2S; + + SR_STensor tSR_sRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + SR_RTensor tSR_rRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + Residue residue_cRow; // (m, n) + ThrNum thr_num; + Params const& params; + bool is_nullptr; + + CUTLASS_DEVICE void + begin() { + if (is_nullptr) { + return; + } + + auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + Tensor tGS_gRow_flt = filter_zeros(tGS_gRow); + Tensor tGS_sRow_flt = filter_zeros(tGS_sRow); + Tensor tGS_cRow_flt = filter_zeros(tGS_cRow, tGS_gRow.stride()); + + for (int i = 0; i < size(tGS_gRow_flt); ++i) { + if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) { + continue; // OOB of SMEM, + } + if (elem_less(tGS_cRow_flt(i), residue_cRow)) { + tGS_sRow_flt(i) = tGS_gRow_flt(i); + } + else { + tGS_sRow_flt(i) = ElementInput(0); // Set to Zero when OOB so LDS can be issued without any preds. 
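// Illustrative sketch (editor's addition, not part of this patch): the row
// broadcast's begin() stages the bias row from gmem into smem with residue
// predication, writing zero for out-of-bounds columns so that the later
// smem-to-register loads need no predicates, as the comment above notes.
// A simplified host-side analogue with hypothetical names:
#include <algorithm>
#include <vector>

template <class ElementInput>
void stage_row_tile(ElementInput const* gmem_row,  // start of the (N,) bias row
                    int n_offset, int n_problem,   // CTA column offset and problem N
                    int cta_n,                     // CTA_N tile width
                    std::vector<ElementInput>& smem_row) {
  smem_row.assign(cta_n, ElementInput(0));                        // OOB columns read back as zero
  int valid = std::max(0, std::min(cta_n, n_problem - n_offset)); // residue predication
  for (int i = 0; i < valid; ++i) {
    smem_row[i] = gmem_row[n_offset + i];
  }
}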
+ } + } + synchronize(); + } + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if (epi_m == 0 and not is_nullptr) { // Assumes M-major subtile loop + Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n)); + Tensor tSR_rRow_flt = make_tensor_like(tSR_sRow_flt); + copy_aligned(tSR_sRow_flt, tSR_rRow_flt); + + constexpr int FrgSize = size(tSR_rRow_flt); + using FrgInput = Array; + using FrgCompute = Array; + using ConvertInput = NumericArrayConverter; + + Tensor tSR_rRow_input_frg = recast(coalesce(tSR_rRow_flt)); + Tensor tSR_rRow_compute_frg = recast(filter(tSR_rRow)); + ConvertInput convert_input{}; + + tSR_rRow_compute_frg(_0{}) = convert_input(tSR_rRow_input_frg(_0{})); + } + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_row; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_row[i] = tSR_rRow(epi_v * FragmentSize + i); + } + + return frg_row; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + using ThreadCount = decltype(size(args.tiled_copy)); + + auto layout_N = [&] () { + auto shape_N = get<1>(args.problem_shape_mnkl); + if constexpr (IsDynamicBroadcast) { + auto stride_N = repeat_like(shape_N, int(0)); + if (get<1>(params.dRow) == bool(1)) { + stride_N = transform_leaf(compact_major(shape_N), + [] (auto const& stride) { return static_cast(stride); } + ); + } + return make_layout(shape_N, stride_N); + } + else { + return make_layout(shape_N); + } + }(); + + auto layout_M = make_layout(M, repeat_like(M, _0{})); + auto layout_L = make_layout(L, get<2>(params.dRow)); + Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_layout(layout_M,layout_N,layout_L)); + Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n)); // (CTA_M, CTA_N) + Tensor sRow = make_tensor(make_smem_ptr(smem), + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{})); // (CTA_M, CTA_N) + //// G2S: Gmem to Smem + auto tiled_g2s = make_tiled_copy(Copy_Atom{}, + Layout< Shape<_1, ThreadCount>, + Stride<_0, _1>>{}, + Layout<_1>{}); + auto thr_g2s = tiled_g2s.get_slice(args.thread_idx); + Tensor tGS_gRow = thr_g2s.partition_S(gRow); + Tensor tGS_sRow = thr_g2s.partition_D(sRow); + + //// G2S: Coord + Tensor tGS_cRow = thr_g2s.partition_S(args.cD); + + //// S2R: Smem to Reg + Tensor tSR_sRow = sm90_partition_for_epilogue(sRow, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tSR_rRow = make_tensor_like(take<0,3>(tSR_sRow)); // (CPY,CPY_M,CPY_N) + + return ConsumerStoreCallbacks( + tGS_gRow, + tGS_sRow, + tGS_cRow, tiled_g2s, + tSR_sRow, + tSR_rRow, + args.residue_cD, + ThreadCount{}, + params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Column vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class ElementInput, + class ElementCompute = ElementInput, + class StrideMNL_ = Stride<_1,_0,_0>, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true // Fallback scalar broadcast for nullptr params +> +struct Sm90ColBroadcast { + using StrideMNL = StrideMNL_; + static_assert(Stages == 0, "Column broadcast doesn't support smem pipelining"); + + static constexpr bool 
IsDynamicBroadcast = is_same_v(StrideMNL{}))>, bool>; // Column vector or scalar broadcast + static_assert(is_static_v(StrideMNL{}))> || IsDynamicBroadcast); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_1,_0>{} || IsDynamicBroadcast); + + // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem + struct SharedStorage { }; + + struct Arguments { + ElementInput const* ptr_col = nullptr; + ElementInput null_default = ElementInput(0); + StrideMNL dCol = {}; + }; + + struct Params { + ElementInput const* ptr_col = nullptr; + ElementCompute null_default = ElementCompute(0); + StrideMNL dCol = {}; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return {args.ptr_col, ElementCompute(args.null_default), args.dCol}; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return is_zero_; + } + + CUTLASS_HOST_DEVICE + Sm90ColBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90ColBroadcast(Params const& params, SharedStorage const& shared_storage) + : params(params), is_zero_(false) { + auto const& [stride_M, stride_N, stride_L] = params.dCol; + // Nullptr default + if (EnableNullptr && params.ptr_col == nullptr) { + is_zero_ = params.null_default == ElementCompute(0); + } + // Dynamic non-batched scalar broadcast + else if (IsDynamicBroadcast && stride_M == bool(0) && stride_L == repeat_like(stride_L, 0)) { + is_zero_ = params.ptr_col[0] == ElementInput(0); + } + } + + Params params; + bool is_zero_; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(GTensor tCgCol_, RTensor tCrCol_, CTensor tCcCol_, ThrResidue residue_tCcCol_, Params const& params_) + : tCgCol(tCgCol_), + tCrCol(tCrCol_), + tCcCol(tCcCol_), + residue_tCcCol(residue_tCcCol_), + params(params_) { + if (EnableNullptr && params.ptr_col == nullptr) { + fill(tCrCol, params.null_default); + } + } + + GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tCcCol; + Params const& params; + + CUTLASS_DEVICE void + begin() { + if (EnableNullptr && params.ptr_col == nullptr) { + return; + } + + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + Tensor tCgCol_flt = filter_zeros(tCgCol); + Tensor tCrCol_flt = make_tensor_like(filter_zeros(tCrCol)); + Tensor tCcCol_flt = filter_zeros(tCcCol, tCgCol.stride()); + + constexpr auto MCL = decltype(max_common_layout(tCgCol_flt, tCrCol_flt)){}; + constexpr int V = cute::min(Alignment, 
size(MCL)); + if constexpr (V > 1) { + using VecType = uint_bit_t>; + Tensor tCgCol_vec = recast(coalesce(tCgCol_flt)); + Tensor tCrCol_vec = recast(coalesce(tCrCol_flt)); + Tensor tCcCol_vec = tensor<1>(zipped_divide(tCcCol_flt, MCL.compose(Int{}))); + auto pred_fn = [&] (auto const&... coords) { return elem_less(tCcCol_vec(coords...), residue_tCcCol); }; + copy_if(pred_fn, tCgCol_vec, tCrCol_vec); + } + else { + auto pred_fn = [&] (auto const&... coords) { return elem_less(tCcCol_flt(coords...), residue_tCcCol); }; + copy_if(pred_fn, tCgCol_flt, tCrCol_flt); + } + + constexpr int FrgSize = size(tCrCol_flt); + using FrgInput = Array; + using FrgCompute = Array; + using ConvertInput = NumericArrayConverter; + + Tensor tCrCol_input_frg = recast(coalesce(tCrCol_flt)); + Tensor tCrCol_compute_frg = recast(filter(tCrCol)); + ConvertInput convert_input{}; + + tCrCol_compute_frg(_0{}) = convert_input(tCrCol_input_frg(_0{})); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_col; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i); + } + + return frg_col; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto layout_M = [&] () { + auto shape_M = get<0>(args.problem_shape_mnkl); + if constexpr (IsDynamicBroadcast) { + auto stride_M = repeat_like(shape_M, int(0)); + if (get<0>(params.dCol) == bool(1)) { + stride_M = transform_leaf(compact_major(shape_M), + [] (auto const& stride) { return static_cast(stride); } + ); + } + return make_layout(shape_M, stride_M); + } + else { + return make_layout(shape_M); + } + }(); + + auto layout_N = make_layout(N, repeat_like(N, _0{})); + auto layout_L = make_layout(L, get<2>(params.dCol)); + Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_layout(layout_M,layout_N,layout_L)); + Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + Tensor mCol_static = make_tensor(make_gmem_ptr(params.ptr_col), make_layout(make_layout(M),layout_N,layout_L)); + Tensor tCgCol_static = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol_static, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol_static); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + return ConsumerStoreCallbacks(tCgCol, tCrCol, args.tCcD, args.residue_tCcD, params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Do outer product from the column and row loaded +// +template< + int Stages, + class CtaTileShapeMNK, + class ElementScalar, + class StrideColMNL_ = Stride<_1,_0,int64_t>, /// NOTE: Batched scaling untested for now + class StrideRowMNL_ = Stride<_0,_1,int64_t>, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = false // Fallback scalar broadcast for nullptr params +> +struct Sm90OuterProduct { + using StrideColMNL = StrideColMNL_; + using StrideRowMNL = StrideRowMNL_; + static_assert(Stages == 0, "OuterProduct doesn't support smem usage"); + static_assert(Alignment * sizeof_bits_v % 128 == 0, 
"sub-16B alignment not supported yet"); + static_assert(!EnableNullptr, "Nullptr fallback not implemented"); + static_assert(is_static_v(StrideColMNL{}))> && + is_static_v(StrideRowMNL{}))>, "Only batch stride can be dynamic"); + static_assert(take<0,2>(StrideColMNL{}) == Stride<_1,_0>{} && + take<0,2>(StrideRowMNL{}) == Stride<_0,_1>{}, "Row and column incorrectly formatted"); + + // Accumulator distributes col/row elements evenly amongst threads so we can just directly load from gmem + struct SharedStorage { }; + + struct Arguments { + ElementScalar const* ptr_col = nullptr; + ElementScalar const* ptr_row = nullptr; + StrideColMNL dCol = {}; + StrideRowMNL dRow = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90OuterProduct() { } + + CUTLASS_HOST_DEVICE + Sm90OuterProduct(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template< + class GTensorCol, class RTensorCol, + class GTensorRow, class RTensorRow + > + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(GTensorCol&& tCgCol, RTensorCol&& tCrCol, + GTensorRow&& tCgRow, RTensorRow&& tCrRow, + Params const& params) + : tCgCol(cute::forward(tCgCol)) + , tCrCol(cute::forward(tCrCol)) + , tCgRow(cute::forward(tCgRow)) + , tCrRow(cute::forward(tCrRow)) + , params(params) {} + + GTensorCol tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensorCol tCrCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + GTensorRow tCgRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensorRow tCrRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + Params const& params; + + CUTLASS_DEVICE void + begin() { + + // Filter so we don't issue redundant copies over stride-0 modes + copy(filter(tCgCol), filter(tCrCol)); + copy(filter(tCgRow), filter(tCrRow)); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_colrow; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + Tensor tCrRow_mn = tCrRow(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_colrow[i] = static_cast(tCrCol_mn(epi_v * FragmentSize + i) * tCrRow_mn(epi_v * FragmentSize + i)); + } + return frg_colrow; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_shape(M,N,L), params.dCol); + Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow); + Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCgRow = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mRow, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + Tensor tCrRow = make_tensor_like(tCgRow); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + return ConsumerStoreCallbacks< + decltype(tCgCol), decltype(tCrCol), + decltype(tCgRow), decltype(tCrRow) + >( + cute::move(tCgCol), cute::move(tCrCol), + cute::move(tCgRow), cute::move(tCrRow), + params + ); + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Batch matrix broadcast +// Only need to redefine this if we can multicast across cluster L +template < + int Stages, + class EpilogueTile, + class Element, + class StrideMNL, + class SmemLayoutAtom, + class CopyOpS2R, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true // Fallback scalar broadcast for nullptr params +> +using Sm90MatrixBroadcast + = Sm90AuxLoad; + +namespace detail { + +template +struct IsScalarBroadcast { + static constexpr bool value = false; +}; + +template +struct IsScalarBroadcast(typename Operation::StrideMNL{})), Stride<_0,_0>>>> { + static constexpr bool value = true; +}; + +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp new file mode 100644 index 0000000000..83cfc030df --- /dev/null +++ b/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp @@ -0,0 +1,1724 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Visitor tree store operations for the sm90 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/workspace.h" + +#include "cute/tensor.hpp" +#include "sm90_visitor_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Elementwise Store Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int Stages, + class EpilogueTile, + class Element, + FloatRoundStyle RoundStyle, + class StrideMNL, + class SmemLayoutAtom, + class CopyOpR2S, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true // Noop on nullptr params +> +struct Sm90AuxStore { + using ElementAux = Element; + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + + constexpr static bool is_m_major = epilogue::collective::detail::is_m_major(); + // Find the max contiguous layout usable by TMA (if EpilogueTile is a non-compact tiler) + using SmemShapeTma = decltype(make_shape( + max_common_vector(make_layout(get<0>(EpilogueTile{})),make_layout(get<0>(EpilogueTile{}))), + max_common_vector(make_layout(get<1>(EpilogueTile{})),make_layout(get<1>(EpilogueTile{}))))); + using SmemLayoutTma = decltype(tile_to_shape( + SmemLayoutAtom{}, SmemShapeTma{}, + cute::conditional_t, Step<_1,_2>>{} )); + using SmemLayout = decltype(tile_to_shape( + SmemLayoutTma{}, + make_shape(size<0>(shape(EpilogueTile{})), size<1>(shape(EpilogueTile{})), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + + struct SharedStorage { + alignas(cutlass::detail::alignment_for_swizzle(SmemLayout{})) + array_aligned smem_aux; + }; + + struct Arguments { + Element* ptr_aux = nullptr; + StrideMNL dAux = {}; + }; + + struct Params { + using TMA_Aux = decltype(make_tma_copy( + SM90_TMA_STORE{}, + make_tensor(static_cast(nullptr), repeat_like(StrideMNL{}, int32_t(0)), StrideMNL{}), + SmemLayoutTma{})); + TMA_Aux tma_store_aux; + bool is_nullptr = false; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_mnkl = 
append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + + bool is_nullptr = false; + if constexpr (EnableNullptr) { + is_nullptr = args.ptr_aux == nullptr; + } + + typename Params::TMA_Aux tma_store_aux; + if (not is_nullptr) { + Tensor tensor_aux = make_tensor(args.ptr_aux, make_layout(make_shape(M,N,L), args.dAux)); + tma_store_aux = make_tma_copy(SM90_TMA_STORE{}, tensor_aux, SmemLayoutTma{}); + } + + return {tma_store_aux, is_nullptr}; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90AuxStore() { } + + CUTLASS_HOST_DEVICE + Sm90AuxStore(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms), + smem_aux(const_cast(shared_storage.smem_aux.data())) { } + + Params const* params_ptr; + Element* smem_aux; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template < + class RTensor, + class TiledR2S, + class STensorR2S, + class STensorS2G, + class GTensorS2G + > + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + RTensor&& tC_rAux, + TiledR2S tiled_r2s, + STensorR2S&& tRS_sAux, + STensorS2G&& bSG_sAux, + GTensorS2G&& bSG_gAux, + Params const* params_ptr) + : tiled_r2s(tiled_r2s), + tC_rAux(cute::forward(tC_rAux)), + tRS_sAux(cute::forward(tRS_sAux)), + bSG_sAux(cute::forward(bSG_sAux)), + bSG_gAux(cute::forward(bSG_gAux)), + params_ptr(params_ptr) {} + + TiledR2S tiled_r2s; + RTensor tC_rAux; // (CPY,CPY_M,CPY_N) + STensorR2S tRS_sAux; // (R2S,R2S_M,R2S_N,PIPE) + STensorS2G bSG_sAux; // (S2G,S2G_M,S2G_N,PIPE) + GTensorS2G bSG_gAux; // (S2G,S2G_M,S2G_N,EPI_M,EPI_N) + Params const* params_ptr; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input) { + using ConvertInput = NumericArrayConverter; + ConvertInput convert_input{}; + + Tensor tC_rAux_frg = recast>(coalesce(tC_rAux)); // (EPI_V) + tC_rAux_frg(epi_v) = convert_input(frg_input); + + return frg_input; + } + + CUTLASS_DEVICE void + postreduce(int epi_m, int epi_n, int store_iteration, bool issue_smem_store) { + if constexpr (EnableNullptr) { + if (params_ptr->is_nullptr) { + return; + } + } + + using RLayoutR2S = decltype(cute::layout(TiledR2S{}.get_slice(0).retile_S(RTensor{}))); + Tensor tRS_rAux = make_tensor(tC_rAux.data(), RLayoutR2S{}); // (R2S,R2S_M,R2S_N) + + if (issue_smem_store) { + int store_pipe_index = store_iteration % Stages; + copy(tiled_r2s, tRS_rAux, tRS_sAux(_,_,_,store_pipe_index)); + } + } + + CUTLASS_DEVICE void + tma_store(int epi_m, int epi_n, int store_iteration, bool issue_tma_store) { + if constexpr (EnableNullptr) { + if (params_ptr->is_nullptr) { + return; + } + } + + if (issue_tma_store) { + // Issue the TMA store + int store_pipe_index = store_iteration % Stages; + copy(params_ptr->tma_store_aux, bSG_sAux(_,_,_,store_pipe_index), 
bSG_gAux(_,_,_,epi_m,epi_n)); + } + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + Tensor mAux = params_ptr->tma_store_aux.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) + + Tensor tC_gAux = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + gAux, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tC_rAux = make_tensor(take<0,3>(shape(tC_gAux))); // (CPY,CPY_M,CPY_N) + + Tensor sAux_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(smem_aux), SmemLayout{})); // (EPI_TILE_M,EPI_TILE_N,PIPE) + Tensor gAux_epi = flat_divide(gAux, args.epi_tile); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + auto tiled_r2s = conditional_return( + make_tiled_copy_S(Copy_Atom{}, args.tiled_copy), + make_tiled_copy_D(Copy_Atom{}, args.tiled_copy) + ); + auto tRS_sAux = tiled_r2s.get_slice(args.thread_idx).partition_D(sAux_epi); // (R2S,R2S_M,R2S_N,PIPE) + + ThrCopy thrblk_s2g = params_ptr->tma_store_aux.get_slice(_0{}); + Tensor bSG_sAux = thrblk_s2g.partition_S(sAux_epi); // (TMA,TMA_M,TMA_N,PIPE) + Tensor bSG_gAux = thrblk_s2g.partition_D(gAux_epi); // (TMA,TMA_M,TMA_N,EPI_M,EPI_N) + + return ConsumerStoreCallbacks( + cute::move(tC_rAux), + tiled_r2s, + cute::move(tRS_sAux), + cute::move(bSG_sAux), + cute::move(bSG_gAux), + params_ptr); + } +}; + +template < + class Element, + class EpilogueTile, // Unused + FloatRoundStyle RoundStyle, + class LayoutOrStrideMNL, + class SmemLayoutAtom, // Unused + class CopyOpR2S, // Unused + int Alignment, + bool EnableNullptr +> +struct Sm90AuxStore< + 0, EpilogueTile, Element, RoundStyle, LayoutOrStrideMNL, + SmemLayoutAtom, CopyOpR2S, Alignment, EnableNullptr +> { + using ElementAux = Element; + using StrideMNL = cutlass::gemm::TagToStrideC_t; + + struct SharedStorage { }; + + struct Arguments { + Element* ptr_aux = nullptr; + StrideMNL dAux = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90AuxStore() { } + + CUTLASS_HOST_DEVICE + Sm90AuxStore(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { } + + Params const* params_ptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template< + class GTensorR2G, + class RTensor, + class CTensorR2G, + class ProblemShapeMNL + > + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + 
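// In this register-to-gmem specialization (Stages == 0, no TMA), the callbacks
// work in two phases: visit() converts each input fragment to Element and
// stashes it in the register tensor tC_rAux, and end_loop() writes the
// registers back to gmem with a vectorized, residue-predicated copy_if once
// the subtile is complete, skipping the store entirely when ptr_aux is null.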
CUTLASS_DEVICE + ConsumerStoreCallbacks( + GTensorR2G&& tC_gAux, + RTensor&& tC_rAux, + CTensorR2G&& tC_cAux, + ProblemShapeMNL problem_shape_mnl, + Params const* params_ptr) + : tC_gAux(cute::forward(tC_gAux)), + tC_rAux(cute::forward(tC_rAux)), + tC_cAux(cute::forward(tC_cAux)), + problem_shape_mnl(problem_shape_mnl), + params_ptr(params_ptr) {} + + GTensorR2G tC_gAux; + RTensor tC_rAux; + CTensorR2G tC_cAux; + ProblemShapeMNL problem_shape_mnl; + Params const* params_ptr; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input) { + using ConvertInput = NumericArrayConverter; + ConvertInput convert_input{}; + + Tensor tC_rAux_frg = recast>(coalesce(tC_rAux)); + tC_rAux_frg(epi_v) = convert_input(frg_input); + + return frg_input; + } + + CUTLASS_DEVICE void + end_loop(int epi_m, int epi_n) { + if constexpr (EnableNullptr) { + if (params_ptr->ptr_aux == nullptr) { + return; + } + } + + constexpr auto MCL = decltype(max_common_layout(tC_gAux(_,_,_,_0{},_0{}), tC_rAux)){}; + constexpr int V = cute::min(Alignment, size(MCL)); + + Tensor tC_cAux_mn = tC_cAux(_,_,_,epi_m,epi_n); + Tensor tC_cAux_vec = tensor<1>(zipped_divide(coalesce(tC_cAux_mn), MCL.compose(Int{}))); + + Tensor tC_gAux_vec = recast>(coalesce(tC_gAux(_,_,_,epi_m,epi_n))); + Tensor tC_rAux_vec = recast>(coalesce(tC_rAux)); + + auto pred_fn = [&] (auto const&... coords) { + return elem_less(tC_cAux_vec(coords...), problem_shape_mnl); + }; + + copy_if(pred_fn, tC_rAux_vec, tC_gAux_vec); + } + }; + + template < + bool ReferenceSrc, + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + auto problem_shape_mnl = make_shape(M,N,L); + + // Gmem Tensor + Tensor mAux = make_tensor( + make_gmem_ptr(params_ptr->ptr_aux), make_shape(M,N,L), params_ptr->dAux + ); + Tensor tC_gAux = sm90_partition_for_epilogue( + mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + // Register Tensor + Tensor tC_rAux = make_tensor(take<0,3>(shape(tC_gAux))); + + // Predication support + Tensor coordAux = make_identity_tensor(shape(mAux)); + Tensor tC_cAux = sm90_partition_for_epilogue( + coordAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tC_gAux), + cute::move(tC_rAux), + cute::move(tC_cAux), + problem_shape_mnl, + params_ptr + ); + + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Reduction Store Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Scalar reduction +template < + template class RegReduceFn, + template class GmemReduceFn, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + class StrideMNL = Stride<_0,_0,_0>, + bool EnableNullptr = true // Noop on nullptr params +> +struct Sm90ScalarReduction { +private: + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{}); + static constexpr bool IsAtomic = is_atomic>::value; + static_assert(IsAtomic, "non-atomic scalar reduction not supported yet"); + +public: + struct SharedStorage { }; + + struct Arguments { + ElementOutput* ptr_scalar = nullptr; + ElementCompute reduction_identity = 
ElementCompute(0); + StrideMNL dScalar = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + #if !defined(CUTLASS_SKIP_REDUCTION_INIT) + if constexpr (IsAtomic) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + Layout mScalar_layout = make_layout(make_shape(M,N,L), args.dScalar); + if (args.ptr_scalar != nullptr) { + return fill_workspace(args.ptr_scalar, ElementOutput(args.reduction_identity), cosize(mScalar_layout), stream, cuda_adapter); + } + } + #endif + + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90ScalarReduction() { } + + CUTLASS_HOST_DEVICE + Sm90ScalarReduction(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params const params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + int l_coord, + CTensor tCcScalar, + ThrResidue residue_tCcScalar, + Params const& params) + : scalar(params.reduction_identity), + l_coord(l_coord), + tCcScalar(tCcScalar), + residue_tCcScalar(residue_tCcScalar), + params(params) {} + + ElementCompute scalar; + int l_coord; + CTensor tCcScalar; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tCcScalar; + Params params; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input) { + if constexpr (EnableNullptr) { + if (params.ptr_scalar == nullptr) { + return frg_input; + } + } + + using ConvertInput = NumericArrayConverter; + using ReduceInput = RegReduceFn; + ConvertInput convert_input{}; + ReduceInput reduce_input{}; + + Array frg_I = convert_input(frg_input); + Tensor tCcScalar_mn = tCcScalar(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + if (elem_less(tCcScalar_mn(epi_v * FragmentSize + i), residue_tCcScalar)) { + scalar = reduce_input(scalar, frg_I[i]); + } + } + + return frg_input; + } + + CUTLASS_DEVICE void + end() { + if constexpr (EnableNullptr) { + if (params.ptr_scalar == nullptr) { + return; + } + } + + using ConvertI = NumericConverter; + using ReduceInput = GmemReduceFn; + + ConvertI convert_I{}; + ReduceInput reduce_input{}; + + ElementOutput* ptr_scalar = params.ptr_scalar + l_coord * get<2>(params.dScalar); + reduce_input(ptr_scalar, convert_I(scalar)); + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + return ConsumerStoreCallbacks( + get<3>(args.tile_coord_mnkl), args.tCcD, args.residue_tCcD, params); + } + +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Row vector reduction +template < + template class RegReduceFn, + template class ShuffleReduceFn, + template class GmemReduceFn, + int Stages, + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + class StrideMNL = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true, // Noop on nullptr params + // If this is false, ptr_row is assumed to point to a compact n-major (ceil_div(M,CTA_M), round_nearest(N,CTA_N), L) + // tensor of ElementCompute. It is the user's responsibility to reduce this to a (N, L) tensor of ElementOutput + bool FinalReduction = true, + // False means skip OOB predication if OOB inputs are known to be the reduction identity + bool VisitCheckOOB = true, + // Indicate the parameter order when calling RegReduceFn + // Seq length equals the number of RegReduceFn parameters + // No.0 represents tCrRow; No.1 and subsequent numbers sequentially represent frg_inputs in `visit` + class RegReduceSeq = cute::seq<0, 1> +> +struct Sm90RowReduction { +private: + static_assert(Stages == 0, "Smem usage not supported yet"); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{}); + static constexpr bool IsAtomic = is_atomic>::value; + static_assert(not (IsAtomic && not FinalReduction), "atomic reduction must be final"); + +public: + struct SharedStorage { }; + + struct Arguments { + void* ptr_row = nullptr; // ElementOutput* if FinalReduction, else ElementCompute* + ElementCompute reduction_identity = 0; + StrideMNL dRow = {}; + }; + + struct Params { + void* ptr_row = nullptr; + ElementCompute reduction_identity = 0; + StrideMNL dRow = {}; + ElementCompute* reduction_buffer = nullptr; + int* tile_counters = nullptr; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + ElementCompute* reduction_buffer; + int* tile_counters = nullptr; + if constexpr (IsAtomic) { + reduction_buffer = nullptr; + } + else if constexpr (FinalReduction) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + size_t tile_counters_offset = product(ceil_div(make_shape(size<>(M), size<>(N), L), make_shape(tile_M, tile_N))) * tile_N * sizeof(ElementCompute); + tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment); + + reduction_buffer = reinterpret_cast(workspace); + tile_counters = reinterpret_cast(reinterpret_cast(workspace) + tile_counters_offset); + } + else { + reduction_buffer = reinterpret_cast(args.ptr_row); + } + + return { + args.ptr_row, + args.reduction_identity, + args.dRow, + reduction_buffer, + tile_counters + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + if constexpr (IsAtomic || not FinalReduction) { + return 0; + } + 
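+    // Descriptive note (added): workspace layout is a partial-reduction buffer of ElementCompute
+    // (one tile_N-wide row per (m-tile, n-tile, l) block), padded up to MinWorkspaceAlignment,
+    // followed by one int tile counter per n-tile, used below to elect the CTA that performs
+    // the final cross-tile reduction.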
+ size_t workspace_size = 0; + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + // Increment by size of reduction buffer + workspace_size += product(ceil_div(make_shape(size<>(M),size<>(N),L), make_shape(tile_M, tile_N))) * tile_N * sizeof(ElementCompute); + // Align and increment by size of tile counters + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + workspace_size += cute::ceil_div(size<>(N), tile_N) * sizeof(int); + return workspace_size; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + if constexpr (IsAtomic) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + Layout mRow_layout = make_layout(make_shape(size<>(M),size<>(N),size<>(L)), args.dRow); + if (args.ptr_row != nullptr) { + return fill_workspace(args.ptr_row, ElementOutput(args.reduction_identity), cosize(mRow_layout), stream, cuda_adapter); + } + return Status::kSuccess; + } + else if constexpr (FinalReduction) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + size_t tile_counters_offset = product(ceil_div(make_shape(size<>(M),size<>(N),L), make_shape(tile_M, tile_N))) * tile_N * sizeof(ElementCompute); + tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment); + + int* tile_counters = reinterpret_cast(reinterpret_cast(workspace) + tile_counters_offset); + size_t tile_counters_size = cute::ceil_div(size<>(N), tile_N) * sizeof(int); + return zero_workspace(tile_counters, tile_counters_size, stream, cuda_adapter); + } + else { + return Status::kSuccess; + } + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90RowReduction() { } + + CUTLASS_HOST_DEVICE + Sm90RowReduction(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params) + : args_tuple(cute::forward(args_tuple)), + params(params) {} + + ArgsTuple args_tuple; + Params const& params; + bool do_final_reduction = false; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const&... 
frg_inputs) { + if constexpr (EnableNullptr) { + if (params.ptr_row == nullptr) { + return cute::get<0>(cute::make_tuple(frg_inputs...)); + } + } + + auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple; + Tensor tCrRow_mn = tCrRow(_,_,_,epi_m,epi_n); + Tensor tCcRow_mn = tCcRow(_,_,_,epi_m,epi_n); + + if constexpr (VisitCheckOOB) { + using ReduceInput = RegReduceFn; + ReduceInput reduce_input{}; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + if (elem_less(tCcRow_mn(epi_v * FragmentSize + i), residue_tCcRow)) { + ElementCompute& tCrRow_vmn = tCrRow_mn(epi_v * FragmentSize + i); + tCrRow_vmn = transform_apply(cute::make_tuple(frg_inputs...), + [&] (auto&& frg_input) { + return ElementCompute(frg_input[i]); + }, + [&] (auto&&... cvt_frg_inputs) { + auto frg_compute_tuple = cute::make_tuple(tCrRow_vmn, cvt_frg_inputs...); + return cute::detail::apply(frg_compute_tuple, reduce_input, RegReduceSeq{}); + }); + } + } + } + else { + constexpr int RegFragSize = cute::max(1, static_cast(sizeof(uint32_t) / sizeof(ElementCompute))); + using ReduceInput = RegReduceFn>; + ReduceInput reduce_input{}; + Tensor tCrRow_mn_frg = recast>(tCrRow_mn); + + constexpr int RegFragArraySize = FragmentSize / RegFragSize; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < RegFragArraySize; ++i) { + Array& tCrRow_vmn_frg = tCrRow_mn_frg(epi_v * RegFragArraySize + i); + tCrRow_vmn_frg = transform_apply(cute::make_tuple(frg_inputs...), + [&] (auto&& frg_input) { + using ElementInput = typename cute::remove_cvref_t::Element; + using ConvertInput = NumericArrayConverter; + using RegFragArr = Array, RegFragArraySize>; + ConvertInput convert_input{}; + return convert_input(reinterpret_cast(frg_input)[i]); + }, + [&] (auto&&... cvt_frg_inputs) { + auto frg_compute_tuple = cute::make_tuple(tCrRow_vmn_frg, cvt_frg_inputs...); + return cute::detail::apply(frg_compute_tuple, reduce_input, RegReduceSeq{}); + }); + } + } + return cute::get<0>(cute::make_tuple(frg_inputs...)); + } + + template + CUTLASS_DEVICE void + reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) { + if (not is_last_iteration) { + return; + } + + auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple; + auto [m, n, k, l] = tile_coord_mnkl; + constexpr bool ReferenceSrc = decltype(ref_src)::value; + if constexpr (EnableNullptr) { + if (params.ptr_row == nullptr) { + return; + } + } + + // fully OOB CTA in partially OOB cluster + if (not elem_less(cRow(_0{},_0{}), residue_cRow)) { + return; + } + + int lane_m = get<0>(lane_mn); + [[maybe_unused]] bool is_reduced_lane = lane_m == 0; + + // + // 1. Warp shuffle reduction + // + using FragmentShuffle = Array; + Tensor tCrRow_frg = recast(filter(tCrRow)); + using ReduceShuffle = ShuffleReduceFn; + ReduceShuffle reduce_shuffle{}; + + auto FrgSizePerLaneM = size(tCrRow_frg) / size<0>(lane_layout_MN); + constexpr bool SwapShuffle = FrgSizePerLaneM > 0; + + // + // Swap Shuffle + // + // The normal way to reduction among threads: + // use shuffle to let *** the first half of threads *** have *** whole data *** from the second half of threads. 
+ // After each step of reduction, a half of threads won't work in the following steps. + // That is, as the reduction progresses, the efficiency of shuffle & reduction instructions gradually change from 1/2, 1/4 to 1/32 (the worst case). + // + // To overcome this shortcoming, for a NxN matrix to be reduced among N threads as a 1XN vectors, + // we use swap & shuffle aiming to let *** each half of threads *** have *** a half of data *** from the other half of threads. + // After reduction, each half of threads should deal with a (N/2)x(N/2) sub-matrix independently in the following step. + // We can recursively do this until the problem size is 1. + // + if constexpr (SwapShuffle) { // for a NxN matrix to be reduced among N threads as a 1XN vectors + Tensor tCrRow_frg_ = logical_divide(tCrRow_frg, FrgSizePerLaneM); // (FrgSizePerLaneM, M) + CUTLASS_PRAGMA_UNROLL + for (int m = size<1>(tCrRow_frg_) / 2; m > 0; m /= 2) { + CUTLASS_PRAGMA_UNROLL + for (int r = 0; r < m; ++r) { + auto frg_A = tCrRow_frg_(_,r); + auto frg_B = tCrRow_frg_(_,r + m); + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < size(frg_A); ++v) { + // Step1: swap + if (not (lane_m & m)) { // the first half of threads swap fragments from the first half of data to the second + cutlass::swap(frg_A(v), frg_B(v)); + } + + // Step2: shuffle + uint64_t frg_shfl = reinterpret_cast(frg_A(v)); + // each half of threads get a half of data from the other half of threads + frg_shfl = __shfl_xor_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(m, _0{})); + + // Step3: reduction + frg_A(v) = reduce_shuffle(frg_B(v), reinterpret_cast(frg_shfl)); + } + } + } + } + else { + CUTLASS_PRAGMA_UNROLL + for (int reduction_rows = size<0>(lane_layout_MN) / 2; reduction_rows > 0; reduction_rows /= 2) { + CUTLASS_PRAGMA_UNROLL + for (int frg_idx = 0; frg_idx < size(tCrRow_frg); ++frg_idx) { + uint64_t frg_shfl = reinterpret_cast(tCrRow_frg(frg_idx)); + frg_shfl = __shfl_down_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(reduction_rows, _0{})); + tCrRow_frg(frg_idx) = reduce_shuffle(tCrRow_frg(frg_idx), reinterpret_cast(frg_shfl)); + } + } + } + + // + // 2. Atomic reduction + // + if constexpr (IsAtomic) { + // Filter so we don't issue redunant copies over stride-0 modes + Tensor tCrRow_flt = filter_zeros(tCrRow); + Tensor tCcRow_flt = make_tensor(tCcRow.data(), make_layout(tCrRow_flt.shape(), tCcRow.stride())); + auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN); + + Tensor tCgRow = sm90_partition_for_epilogue(gRow_l(_,_,l), epi_tile, tiled_copy, thread_idx); + Tensor tCgRow_flt = filter_zeros(tCgRow); + // NOTE: atomic reduction is performed in the output type + using ConvertOutput = NumericConverter; + using ReduceOutput = GmemReduceFn; + ConvertOutput convert_output{}; + ReduceOutput reduce_output{}; + + if constexpr (SwapShuffle) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FltFrgSizePerLaneM; ++i) { + int idx = lane_m * FltFrgSizePerLaneM + i; + // Only care about OOB for N mode + if (get<1>(tCcRow_flt(idx)) < get<1>(residue_tCcRow)) { + reduce_output(&tCgRow_flt(idx), convert_output(tCrRow_flt(i))); + } + } + } + else { + if (is_reduced_lane) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrRow_flt); ++i) { + if (elem_less(tCcRow_flt(i), residue_tCcRow)) { + reduce_output(&tCgRow_flt(i), convert_output(tCrRow_flt(i))); + } + } + } + } + sync_fn(); + } + + // + // 2. 
One warp in M, skip threadblock smem reduction + // + else if constexpr (decltype(size<0>(warp_layout_MN))::value <= 1) { + // Dump warp reduction to gmem workspace + using ElementGmem = cute::conditional_t; + Tensor tCgBuf = sm90_partition_for_epilogue(gBuf_ml(_,_,m,l), epi_tile, tiled_copy, thread_idx); + + if constexpr (SwapShuffle) { + Tensor tCrRow_flt = filter(tCrRow); + Tensor tCgBuf_flt = recast(filter(tCgBuf)); + auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN); + Tensor tCgBuf_flt_ = logical_divide(tCgBuf_flt, FltFrgSizePerLaneM); // (FltFrgSizePerLaneM, M) + Tensor tCrRow_flt_ = logical_divide(tCrRow_flt, FltFrgSizePerLaneM); // (FltFrgSizePerLaneM, M) + copy_aligned(tCrRow_flt_(_,_0{}), tCgBuf_flt_(_,lane_m)); + } + else { + if (is_reduced_lane) { + copy_aligned(tCrRow, recast(tCgBuf)); + } + } + sync_fn(); + } + + // + // 2. Multiple warps in M, do threadblock smem reduction + // + else { + Tensor sBuf = make_tensor(make_smem_ptr(raw_pointer_cast(smem_buffer.data())), sBuf_layout); + static_assert(decltype(cosize(sBuf.layout()))::value * sizeof(ElementCompute) <= + decltype(cosize(smem_buffer.layout()))::value * sizeof(typename remove_cvref_t::value_type), + "smem reduction buffer not large enough, use a larger epilogue tile"); + sync_fn(); + + // Dump warp reduction to smem workspace + Tensor tCsBuf = sm90_partition_for_epilogue(sBuf(_,_,get<0>(warp_mn)), epi_tile, tiled_copy, thread_idx); + + if constexpr (SwapShuffle) { + Tensor tCrRow_flt = filter(tCrRow); + Tensor tCsBuf_flt = filter(tCsBuf); + auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN); + Tensor tCsBuf_flt_ = logical_divide(tCsBuf_flt, FltFrgSizePerLaneM); // (FltFrgSizePerLaneM, M) + Tensor tCrRow_flt_ = logical_divide(tCrRow_flt, FltFrgSizePerLaneM); // (FltFrgSizePerLaneM, M) + copy_aligned(tCrRow_flt_(_,_0{}), tCsBuf_flt_(_,lane_m)); + } + else { + if (is_reduced_lane) { + copy_aligned(tCrRow, tCsBuf); + } + } + sync_fn(); + + constexpr int SmemFragSize = cute::max(size_t{1}, sizeof(uint32_t) / sizeof(ElementCompute)); + using FragmentSmem = Array; + using VectorSmem = uint_bit_t>; + using ReduceSmem = GmemReduceFn; + ReduceSmem reduce_smem{}; + + Tensor sBuf_frg = recast(filter_zeros(sBuf)); + Tensor sBuf_vec = recast(filter_zeros(sBuf)); + constexpr int FragsPerRow = decltype(size<1>(sBuf_frg))::value; + + constexpr int RowNum = decltype(size<0>(warp_layout_MN))::value; + using FragmentSmemArray = Array; + + // Do the threadblock smem reduction + using VectorGmem = cute::conditional_t; + Tensor gBuf_vec = recast(filter(gBuf_ml(_,_,m,l))); + CUTLASS_PRAGMA_UNROLL + for (int frg_idx = thread_idx; frg_idx < FragsPerRow; frg_idx += size(tiled_copy)) { + FragmentSmemArray frg_smem; + + CUTLASS_PRAGMA_UNROLL + for (int reduction_rows = 0; reduction_rows < RowNum; ++reduction_rows) { + int FragsCurrRows = reduction_rows * FragsPerRow; + frg_smem[reduction_rows] = sBuf_frg(FragsCurrRows + frg_idx); + } + + CUTLASS_PRAGMA_UNROLL + for (int reduction_rows = RowNum / 2; reduction_rows > 0; reduction_rows /= 2) { + CUTLASS_PRAGMA_UNROLL + for (int row_idx = 0; row_idx < reduction_rows; ++row_idx) { + frg_smem[row_idx] = reduce_smem(frg_smem[row_idx], frg_smem[row_idx + reduction_rows]); + } + } + gBuf_vec(frg_idx) = reinterpret_cast(frg_smem[0]); + } + sync_fn(); + } + + // + // 3. 
Increment atomic counters to signal final gmem reduction + // + if constexpr (not IsAtomic && FinalReduction) { + // Ensure gmem writes are visible to other threads before incrementing counter + __threadfence(); + sync_fn(); + // Collective thread 0 increments atomic tile counter and copies value to smem + int* prev_tile_count = reinterpret_cast(raw_pointer_cast(smem_buffer.data())); + if (thread_idx == 0) { + *prev_tile_count = atomicAdd(¶ms.tile_counters[n], 1); + } + sync_fn(); + // Broadcast tile count to other threads in CTA and determine final reduction status + do_final_reduction = *prev_tile_count == size<2>(gBuf_ml) * size<3>(gBuf_ml) - 1; + sync_fn(); + } + } + + CUTLASS_DEVICE void + end() { + // + // 4. Do final gmem reduction if necessary + // + if constexpr (not IsAtomic && FinalReduction) { + if (not do_final_reduction) { + return; + } + + auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple; + + using ReduceOutput = GmemReduceFn; + using ConvertOutput = NumericConverter; + ReduceOutput reduce_output{}; + ConvertOutput convert_output{}; + + // Reduction over batches + if (size<2>(stride(gRow_l)) == 0) { + CUTLASS_PRAGMA_NO_UNROLL + for (int n = thread_idx; n < size<1>(gBuf_ml); n += size(tiled_copy)) { + Tensor tRgBuf_ml = gBuf_ml(_0{},n,_,_); + ElementCompute output = tRgBuf_ml(_0{}); + CUTLASS_PRAGMA_NO_UNROLL + for (int ml = 1; ml < size(tRgBuf_ml); ++ml) { + output = reduce_output(output, tRgBuf_ml(ml)); + } + if (elem_less(cRow(_0{},n), residue_cRow)) { + gRow_l(_0{},n,_0{}) = convert_output(output); + } + } + } + // No reduction over batches + else { + CUTLASS_PRAGMA_NO_UNROLL + for (int n = thread_idx; n < size<1>(gBuf_ml); n += size(tiled_copy)) { + bool do_store = elem_less(cRow(_0{},n), residue_cRow); + CUTLASS_PRAGMA_NO_UNROLL + for (int l = 0; l < size<3>(gBuf_ml); ++l) { + Tensor tRgBuf_m = gBuf_ml(_0{},n,_,l); + ElementCompute output = tRgBuf_m(_0{}); + CUTLASS_PRAGMA_NO_UNROLL + for (int m = 1; m < size(tRgBuf_m); ++m) { + output = reduce_output(output, tRgBuf_m(m)); + } + if (do_store) { + gRow_l(_0{},n,l) = convert_output(output); + } + } + } + } + + } + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + Layout ref_layout_MN = [&] () { + if constexpr (ReferenceSrc) { return get<0>(args.tiled_copy.get_layoutS_MN()); } + else { return get<0>(args.tiled_copy.get_layoutD_MN()); } + }(); // tile_mn -> tv_idx + + // Get the MN layout + coord of lanes to determine shuffle reduction iterations + using _W = Int; + Layout tv2lane = Layout,_W,_1>,Stride<_1,_0,_0>>{}; // tv_idx -> lane_idx + Layout ref2lane = composition(tv2lane, ref_layout_MN); // tile_mn -> lane_idx + Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane))); // lane_mn -> lane_idx + Layout inv_lane_layout_MN = right_inverse(lane_layout_MN); // lane_idx -> lane_mn + int lane_idx = canonical_lane_idx(); + auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN)); + + // Get the MN layout + coord of warps to determine smem reduction iterations + Layout tv2warp = Layout,_W,_1>,Stride<_0,_1,_0>>{}; // tv_idx -> warp_idx + Layout ref2warp = composition(tv2warp, ref_layout_MN); // tile_mn -> warp_idx + Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp))); // warp_mn -> warp_idx + Layout inv_warp_layout_MN = right_inverse(warp_layout_MN); // warp_idx -> warp_mn + + int warp_idx = args.thread_idx / NumThreadsPerWarp; + auto warp_mn = idx2crd(inv_warp_layout_MN(warp_idx), shape(warp_layout_MN)); + + // Partition output gmem and register tensors + auto [tile_M, tile_N, tile_K] = args.tile_shape_mnk; + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow); // (M,N,L) + Tensor gRow_l = local_tile(mRow, take<0,2>(args.tile_shape_mnk), make_coord(m,n,_)); // (CTA_M,CTA_N,L) + Tensor tCgRow = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + gRow_l(_,_,l), args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrRow = make_tensor_like(tCgRow); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + fill(tCrRow, params.reduction_identity); + + // Partition gmem+smem reduction buffer tensors + Layout gBuf_layout = make_layout(take<0,2>(args.tile_shape_mnk), make_stride(_0{}, _1{})); + auto block_shape = ceil_div(make_shape(M,N,L), shape(gBuf_layout)); // (M_CNT, N_CNT, L_CNT) + + // Let the M_CNT (the num of partial reduction results) become the outer mode + Layout block_layout = make_layout(block_shape, make_stride(get<1>(block_shape), _1{}, get<0>(block_shape) * get<1>(block_shape))); + Layout mBuf_layout = blocked_product(gBuf_layout, block_layout); + Tensor mBuf = make_tensor(make_gmem_ptr(params.reduction_buffer), mBuf_layout); // (ceil_M,ceil_N,L) + Tensor gBuf_ml = local_tile(mBuf, take<0,2>(args.tile_shape_mnk), make_coord(_,n,_)); // (CTA_M,CTA_N,REST_M,L) + Layout sBuf_layout = blocked_product(gBuf_layout, // (CTA_M,CTA_N,WARPS_M) + make_layout(make_shape(_1{},_1{},size<0>(warp_layout_MN)))); + + auto args_tuple = make_tuple( + bool_constant{}, cute::move(tCrRow), args.tCcD, gRow_l, args.cD, gBuf_ml, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + args.tile_coord_mnkl, args.residue_cD, args.residue_tCcD, args.epi_tile, args.tiled_copy, args.thread_idx); + return ConsumerStoreCallbacks(cute::move(args_tuple), params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Col vector reduction +template < + template class RegReduceFn, + template class 
ShuffleReduceFn, + template class GmemReduceFn, + int Stages, + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + class StrideMNL = Stride<_1,_0,_0>, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true, // Noop on nullptr params + // If this is false, ptr_col is assumed to point to a compact m-major (round_nearest(M,CTA_M), ceil_div(N,CTA_N), L) + // tensor of ElementCompute. It is the user's responsibility to reduce this to a (M, L) tensor of ElementOutput + bool FinalReduction = true, + // False means skip OOB predication if OOB inputs are known to be the reduction identity + bool VisitCheckOOB = true +> +struct Sm90ColReduction { +private: + static_assert(Stages == 0, "Smem usage not supported yet"); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_1,_0>{}); + static constexpr bool IsAtomic = is_atomic>::value; + static_assert(not (IsAtomic && not FinalReduction), "atomic reduction must be final"); + +public: + struct SharedStorage { }; + + struct Arguments { + void* ptr_col = nullptr; // ElementOutput* if FinalReduction, else ElementCompute* + ElementCompute reduction_identity = 0; + StrideMNL dCol = {}; + }; + + struct Params { + void* ptr_col = nullptr; + ElementCompute reduction_identity = 0; + StrideMNL dCol = {}; + ElementCompute* reduction_buffer = nullptr; + int* tile_counters = nullptr; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + ElementCompute* reduction_buffer; + int* tile_counters = nullptr; + if constexpr (IsAtomic) { + reduction_buffer = nullptr; + } + else if constexpr (FinalReduction) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + size_t tile_counters_offset = product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N))) * tile_M * sizeof(ElementCompute); + tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment); + + reduction_buffer = reinterpret_cast(workspace); + tile_counters = reinterpret_cast(reinterpret_cast(workspace) + tile_counters_offset); + } + else { + reduction_buffer = reinterpret_cast(args.ptr_col); + } + + return { + args.ptr_col, + args.reduction_identity, + args.dCol, + reduction_buffer, + tile_counters + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + if constexpr (IsAtomic || not FinalReduction) { + return 0; + } + + size_t workspace_size = 0; + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + + // Increment by size of reduction buffer + workspace_size += product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N))) * tile_M * sizeof(ElementCompute); + // Align and increment by size of tile counters + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + workspace_size += cute::ceil_div(M, tile_M) * sizeof(int); + + return workspace_size; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& 
args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + if constexpr (IsAtomic) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + Layout mCol_layout = make_layout(make_shape(size<>(M),size<>(N),size<>(L)), args.dCol); + if (args.ptr_col != nullptr) { + return fill_workspace(args.ptr_col, ElementOutput(args.reduction_identity), cosize(mCol_layout), stream, cuda_adapter); + } + return Status::kSuccess; + } + else if constexpr (FinalReduction) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + size_t tile_counters_offset = product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N))) * tile_M * sizeof(ElementCompute); + tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment); + + int* tile_counters = reinterpret_cast(reinterpret_cast(workspace) + tile_counters_offset); + size_t tile_counters_size = cute::ceil_div(M, tile_M) * sizeof(int); + return zero_workspace(tile_counters, tile_counters_size, stream, cuda_adapter); + } + else { + return Status::kSuccess; + } + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90ColReduction() { } + + CUTLASS_HOST_DEVICE + Sm90ColReduction(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params) + : args_tuple(cute::forward(args_tuple)), + params(params) {} + + ArgsTuple args_tuple; + Params const& params; + bool do_final_reduction = false; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input) { + if constexpr (EnableNullptr) { + if (params.ptr_col == nullptr) { + return frg_input; + } + } + + auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n); + + using ConvertInput = NumericArrayConverter; + using ReduceInput = RegReduceFn; + ConvertInput convert_input{}; + ReduceInput reduce_input{}; + + Array frg_I = convert_input(frg_input); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + if (!VisitCheckOOB || elem_less(tCcCol_mn(epi_v * FragmentSize + i), residue_tCcCol)) { + ElementCompute& tCrCol_vmn = tCrCol_mn(epi_v * FragmentSize + i); + tCrCol_vmn = reduce_input(tCrCol_vmn, frg_I[i]); + } + } + + return frg_input; + } + + template + CUTLASS_DEVICE void + reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) { + if (not is_last_iteration) { + return; + } + + auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple; + auto [m, n, k, l] = tile_coord_mnkl; + constexpr bool ReferenceSrc = 
decltype(ref_src)::value; + + // Runtime nullptr is noop + if constexpr (EnableNullptr) { + if (params.ptr_col == nullptr) { + return; + } + } + + // fully OOB CTA in partially OOB cluster + if (not elem_less(cCol(_0{},_0{}), residue_cCol)) { + return; + } + + // + // 1. Warp shuffle reduction + // + using FragmentShuffle = Array; + using ReduceShuffle = ShuffleReduceFn; + ReduceShuffle reduce_shuffle{}; + Tensor tCrCol_frg = recast(filter(tCrCol)); + CUTLASS_PRAGMA_UNROLL + for (int reduction_cols = size<1>(lane_layout_MN) / 2; reduction_cols > 0; reduction_cols /= 2) { + CUTLASS_PRAGMA_UNROLL + for (int frg_idx = 0; frg_idx < size(tCrCol_frg); ++frg_idx) { + uint64_t frg_shfl = reinterpret_cast(tCrCol_frg(frg_idx)); + frg_shfl = __shfl_down_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(_0{},reduction_cols)); + tCrCol_frg(frg_idx) = reduce_shuffle(tCrCol_frg(frg_idx), reinterpret_cast(frg_shfl)); + } + } + bool is_reduced_lane = get<1>(lane_mn) == 0; + + // + // 2. Atomic reduction + // + if constexpr (IsAtomic) { + // Filter so we don't issue redunant copies over stride-0 modes + Tensor tCrCol_flt = filter_zeros(tCrCol); + Tensor tCcCol_flt = make_tensor(tCcCol.data(), make_layout(tCrCol_flt.shape(), tCcCol.stride())); + + Tensor tCgCol = sm90_partition_for_epilogue(gCol_l(_,_,l), epi_tile, tiled_copy, thread_idx); + Tensor tCgCol_flt = filter_zeros(tCgCol); + + // NOTE: atomic reduction is performed in the output type + using ConvertOutput = NumericConverter; + using ReduceOutput = GmemReduceFn; + ConvertOutput convert_output{}; + ReduceOutput reduce_output{}; + + if (is_reduced_lane) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrCol_flt); ++i) { + if (elem_less(tCcCol_flt(i), residue_tCcCol)) { + reduce_output(&tCgCol_flt(i), convert_output(tCrCol_flt(i))); + } + } + } + sync_fn(); + } + + // + // 2. One warp in N, skip threadblock smem reduction + // + else if constexpr (decltype(size<1>(warp_layout_MN))::value <= 1) { + // Dump warp reduction to gmem workspace + using ElementGmem = cute::conditional_t; + Tensor tCgBuf = sm90_partition_for_epilogue(gBuf_nl(_,_,n,l), epi_tile, tiled_copy, thread_idx); + if (is_reduced_lane) { + copy_aligned(tCrCol, recast(tCgBuf)); + } + sync_fn(); + } + + // + // 2. 
Multiple warps in N, do threadblock smem reduction + // + else { + Tensor sBuf = make_tensor(make_smem_ptr(raw_pointer_cast(smem_buffer.data())), sBuf_layout); + static_assert(decltype(cosize(sBuf.layout()))::value * sizeof(ElementCompute) <= + decltype(cosize(smem_buffer.layout()))::value * sizeof(typename remove_cvref_t::value_type), + "smem reduction buffer not large enough, use a larger epilogue tile"); + sync_fn(); + + // Dump warp reduction to smem workspace + Tensor tCsBuf = sm90_partition_for_epilogue(sBuf(_,_,get<1>(warp_mn)), epi_tile, tiled_copy, thread_idx); + if (is_reduced_lane) { + copy_aligned(tCrCol, tCsBuf); + } + sync_fn(); + + constexpr int SmemFragSize = cute::max(size_t{1}, sizeof(uint32_t) / sizeof(ElementCompute)); + using FragmentSmem = Array; + using VectorSmem = uint_bit_t>; + using ReduceSmem = GmemReduceFn; + ReduceSmem reduce_smem{}; + + Tensor sBuf_frg = recast(filter_zeros(sBuf)); + Tensor sBuf_vec = recast(filter_zeros(sBuf)); + constexpr int FragsPerCol = decltype(size<0>(sBuf_frg))::value; + + // Do the threadblock smem reduction + CUTLASS_PRAGMA_UNROLL + for (int reduction_cols = size<1>(warp_layout_MN) / 2; reduction_cols > 1; reduction_cols /= 2) { + int FragsPerReduction = reduction_cols * FragsPerCol; + CUTLASS_PRAGMA_NO_UNROLL + for (int frg_idx = thread_idx; frg_idx < FragsPerReduction; frg_idx += size(tiled_copy)) { + FragmentSmem frg_smem = reduce_smem(sBuf_frg(frg_idx), sBuf_frg(frg_idx + FragsPerReduction)); + sBuf_vec(frg_idx) = reinterpret_cast(frg_smem); + } + sync_fn(); + } + + // Do final smem reduction and dump to gmem workspace + using VectorGmem = cute::conditional_t; + Tensor gBuf_vec = recast(filter(gBuf_nl(_,_,n,l))); + CUTLASS_PRAGMA_NO_UNROLL + for (int frg_idx = thread_idx; frg_idx < FragsPerCol; frg_idx += size(tiled_copy)) { + FragmentSmem frg_smem = reduce_smem(sBuf_frg(frg_idx), sBuf_frg(frg_idx + FragsPerCol)); + gBuf_vec(frg_idx) = reinterpret_cast(frg_smem); + } + sync_fn(); + } + + // + // 3. Increment atomic counters to signal final gmem reduction + // + if constexpr (not IsAtomic && FinalReduction) { + // Ensure gmem writes are visible to other threads before incrementing counter + __threadfence(); + sync_fn(); + // Collective thread 0 increments atomic tile counter and copies value to smem + int* prev_tile_count = reinterpret_cast(raw_pointer_cast(smem_buffer.data())); + if (thread_idx == 0) { + *prev_tile_count = atomicAdd(¶ms.tile_counters[m], 1); + } + sync_fn(); + // Broadcast tile count to other threads in CTA and determine final reduction status + do_final_reduction = *prev_tile_count == size<2>(gBuf_nl) * size<3>(gBuf_nl) - 1; + sync_fn(); + } + } + + CUTLASS_DEVICE void + end() { + // + // 4. 
Do final gmem reduction if necessary + // + if constexpr (not IsAtomic && FinalReduction) { + if (not do_final_reduction) { + return; + } + + auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple; + + using ReduceOutput = GmemReduceFn; + using ConvertOutput = NumericConverter; + ReduceOutput reduce_output{}; + ConvertOutput convert_output{}; + + // Reduction over batches + if (size<2>(stride(gCol_l)) == 0) { + CUTLASS_PRAGMA_NO_UNROLL + for (int m = thread_idx; m < size<0>(gBuf_nl); m += size(tiled_copy)) { + Tensor tRgBuf_nl = gBuf_nl(m,_0{},_,_); + ElementCompute output = tRgBuf_nl(_0{}); + CUTLASS_PRAGMA_NO_UNROLL + for (int nl = 1; nl < size(tRgBuf_nl); ++nl) { + output = reduce_output(output, tRgBuf_nl(nl)); + } + if (elem_less(cCol(m,_0{}), residue_cCol)) { + gCol_l(m,_0{},_0{}) = convert_output(output); + } + } + } + // No reduction over batches + else { + CUTLASS_PRAGMA_NO_UNROLL + for (int m = thread_idx; m < size<0>(gBuf_nl); m += size(tiled_copy)) { + bool do_store = elem_less(cCol(m,_0{}), residue_cCol); + CUTLASS_PRAGMA_NO_UNROLL + for (int l = 0; l < size<3>(gBuf_nl); ++l) { + Tensor tRgBuf_n = gBuf_nl(m,_0{},_,l); + ElementCompute output = tRgBuf_n(_0{}); + CUTLASS_PRAGMA_NO_UNROLL + for (int n = 1; n < size(tRgBuf_n); ++n) { + output = reduce_output(output, tRgBuf_n(n)); + } + if (do_store) { + gCol_l(m,_0{},l) = convert_output(output); + } + } + } + } + + } + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + Layout ref_layout_MN = [&] () { + if constexpr (ReferenceSrc) { return get<0>(args.tiled_copy.get_layoutS_MN()); } + else { return get<0>(args.tiled_copy.get_layoutD_MN()); } + }(); // tile_mn -> tv_idx + + // Get the MN layout + coord of lanes to determine shuffle reduction iterations + using _W = Int; + Layout tv2lane = Layout,_W,_1>,Stride<_1,_0,_0>>{}; // tv_idx -> lane_idx + Layout ref2lane = composition(tv2lane, ref_layout_MN); // tile_mn -> lane_idx + Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane))); // lane_mn -> lane_idx + Layout inv_lane_layout_MN = right_inverse(lane_layout_MN); // lane_idx -> lane_mn + int lane_idx = canonical_lane_idx(); + auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN)); + + // Get the MN layout + coord of warps to determine smem reduction iterations + Layout tv2warp = Layout,_W,_1>,Stride<_0,_1,_0>>{}; // tv_idx -> warp_idx + Layout ref2warp = composition(tv2warp, ref_layout_MN); // tile_mn -> warp_idx + Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp))); // warp_mn -> warp_idx + Layout inv_warp_layout_MN = right_inverse(warp_layout_MN); // warp_idx -> warp_mn + int warp_idx = args.thread_idx / NumThreadsPerWarp; + auto warp_mn = idx2crd(inv_warp_layout_MN(warp_idx), shape(warp_layout_MN)); + + // Partition output gmem and register tensors + auto [tile_M, tile_N, tile_K] = args.tile_shape_mnk; + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_shape(M,N,L), params.dCol); // (M,N,L) + Tensor gCol_l = local_tile(mCol, take<0,2>(args.tile_shape_mnk), make_coord(m,n,_)); // (CTA_M,CTA_N,L) + Tensor tCgCol 
= sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + gCol_l(_,_,l), args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + fill(tCrCol, params.reduction_identity); + + // Partition gmem+smem reduction buffer tensors + Layout gBuf_layout = make_layout(take<0,2>(args.tile_shape_mnk), make_stride(_1{}, _0{})); + Layout mBuf_layout = blocked_product(gBuf_layout, make_layout(ceil_div(make_shape(M,N,L), shape(gBuf_layout)))); + Tensor mBuf = make_tensor(make_gmem_ptr(params.reduction_buffer), mBuf_layout); // (ceil_M,ceil_N,L) + Tensor gBuf_nl = local_tile(mBuf, take<0,2>(args.tile_shape_mnk), make_coord(m,_,_)); // (CTA_M,CTA_N,REST_N,L) + Layout sBuf_layout = blocked_product(gBuf_layout,make_layout(make_shape(_1{},_1{},size<1>(warp_layout_MN)))); // (CTA_M,CTA_N,WARPS_N) + + auto args_tuple = make_tuple( + bool_constant{}, cute::move(tCrCol), args.tCcD, gCol_l, args.cD, gBuf_nl, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + args.tile_coord_mnkl, args.residue_cD, args.residue_tCcD, args.epi_tile, args.tiled_copy, args.thread_idx); + return ConsumerStoreCallbacks(std::move(args_tuple), params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Batch matrix reduction +template < + int Stages, + class EpilogueTile, + class Element, + class StrideMNL, + class CopyOpR2S, + class SmemLayoutAtom, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true // Noop on nullptr params +> +struct Sm90MatrixReduction; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp new file mode 100644 index 0000000000..48f4756d1f --- /dev/null +++ b/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp @@ -0,0 +1,1137 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Visitor tree operation base implementation to enable composable fusions + for the sm90 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/workspace.h" + +#include "cute/tensor.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using cute::tuple; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Partitioning Helpers +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class CtaTileMN, + class EpilogueTile, + class TiledCopy +> +CUTLASS_HOST_DEVICE +constexpr auto +sm90_partition_for_epilogue( + CtaTileMN cT, // (CTA_M,CTA_N,...) + EpilogueTile epi_tile, // (EPI_TILE_M,EPI_TILE_N) + TiledCopy tiled_copy, + int thread_idx) { + ThrCopy thread_copy = tiled_copy.get_thread_slice(thread_idx); + Tensor cT_epi = flat_divide(cT, epi_tile); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N,...) + if constexpr (ReferenceSrc) { + return thread_copy.partition_S(cT_epi); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,...) + } + else { + return thread_copy.partition_D(cT_epi); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,...) 
+ } +} + +template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class Engine, class LayoutMNL, + class TileShapeMNK, + class TileCoordMNKL, + class EpilogueTile, + class TiledCopy +> +CUTLASS_HOST_DEVICE +constexpr auto +sm90_partition_for_epilogue( + Tensor mT, // (M,N,L) + TileShapeMNK tile_shape_mnk, // (CTA_M,CTA_N,CTA_K) + TileCoordMNKL tile_coord_mnkl, // (m,n,k,l) + EpilogueTile epi_tile, // (EPI_TILE_M,EPI_TILE_N) + TiledCopy tiled_copy, + int thread_idx) { + auto [m, n, k, l] = tile_coord_mnkl; + auto coord_shape = + make_coord(m, n, l) + ; + Tensor cT = local_tile(mT, take<0,2>(tile_shape_mnk), coord_shape); // (CTA_M,CTA_N) + Tensor tCcT = + sm90_partition_for_epilogue(cT, epi_tile, tiled_copy, thread_idx); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + return tCcT; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Visitor Implementation +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class TiledMma, + class EpilogueTile +> +struct ProducerLoadArgs { + ProblemShapeMNKL problem_shape_mnkl; + TileShapeMNK tile_shape_mnk; + TileCoordMNKL tile_coord_mnkl; + TiledMma tiled_mma; + EpilogueTile epi_tile; + int thread_idx; + + CUTLASS_DEVICE + ProducerLoadArgs( + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_mnk, + TileCoordMNKL tile_coord_mnkl, + TiledMma tiled_mma, + EpilogueTile epi_tile, + int thread_idx) + : problem_shape_mnkl(problem_shape_mnkl), + tile_shape_mnk(tile_shape_mnk), + tile_coord_mnkl(tile_coord_mnkl), + tiled_mma(tiled_mma), + epi_tile(epi_tile), + thread_idx(thread_idx) {} +}; + +template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class TiledMma, + class EpilogueTile, + class TiledCopy, + class CoordTensor, + class Residue, + class ThrCoordTensor, + class ThrResidue, + class ThrSrcTensor +> +struct ConsumerStoreArgs { + ProblemShapeMNKL problem_shape_mnkl; + TileShapeMNK tile_shape_mnk; + TileCoordMNKL tile_coord_mnkl; + TiledMma tiled_mma; + EpilogueTile epi_tile; + TiledCopy tiled_copy; + CoordTensor cD; + Residue residue_cD; + ThrCoordTensor tCcD; + ThrResidue residue_tCcD; + ThrSrcTensor & tCrC; + int thread_idx; + + CUTLASS_DEVICE + ConsumerStoreArgs( + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_mnk, + TileCoordMNKL tile_coord_mnkl, + TiledMma tiled_mma, + EpilogueTile epi_tile, + TiledCopy tiled_copy, + CoordTensor cD, + Residue residue_cD, + ThrCoordTensor tCcD, + ThrResidue residue_tCcD, + ThrSrcTensor & tCrC, + int thread_idx) + : problem_shape_mnkl(problem_shape_mnkl), + tile_shape_mnk(tile_shape_mnk), + tile_coord_mnkl(tile_coord_mnkl), + tiled_mma(tiled_mma), + epi_tile(epi_tile), + tiled_copy(tiled_copy), + cD(cD), + residue_cD(residue_cD), + tCcD(tCcD), + residue_tCcD(residue_tCcD), + tCrC(tCrC), + thread_idx(thread_idx) {} +}; + +template +struct Sm90VisitorImplBase { + // Shared memory allocation + using SharedStorage = tuple; + // Host side fusion arguments + using Arguments = tuple; + // Device side fusion params (Kernel-entry API) + using Params = tuple; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + uint8_t* op_workspace = reinterpret_cast(workspace); + return transform_apply(tuple{}, args, + [&] (auto&& op, auto const& op_args) { + using Op = 
cute::remove_cvref_t; + auto ret = Op::to_underlying_arguments(problem_shape, op_args, op_workspace); + if (op_workspace != nullptr) { + size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args); + op_workspace += round_nearest(op_workspace_size, MinWorkspaceAlignment); + } + return ret; + }, + [] (auto&&... op_params) { return cute::make_tuple(op_params...); } + ); + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return transform_apply(tuple{}, args, + [&] (auto&& op, auto const& op_args) { + using Op = cute::remove_cvref_t; + return Op::can_implement(problem_shape, op_args); + }, + [&] (auto&&... implementable) { + return (true && ... && implementable); + } + ); + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return transform_apply(tuple{}, args, + [&] (auto&& op, auto const& op_args) { + using Op = cute::remove_cvref_t; + size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args); + return round_nearest(op_workspace_size, MinWorkspaceAlignment); + }, + [&] (auto&&... op_workspace_size) { + return (0 + ... + op_workspace_size); + } + ); + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* op_workspace = reinterpret_cast(workspace); + return transform_apply(tuple{}, args, + // Initialize each operation's workspace, stopping at the first error + [&] (auto&& op, auto const& op_args) { + if (status != Status::kSuccess) { + return status; + } + + using Op = cute::remove_cvref_t; + status = Op::initialize_workspace(problem_shape, op_args, op_workspace, stream, cuda_adapter); + if (op_workspace != nullptr) { + size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args); + op_workspace += round_nearest(op_workspace_size, MinWorkspaceAlignment); + } + return status; + }, + // Return the final status + [&] (auto const&...ops) { return status; } + ); + } + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase() {} + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) + : ops(transform_apply(tuple{}, params, shared_storage, + [] (auto&& op, auto const& op_params, auto&& op_storage) { + using Op = cute::remove_cvref_t; + return Op(op_params, op_storage); + }, + [] (auto&&... ops) { return cute::make_tuple(ops...); } + )) {} + + // Ops can store kernel persistent variables (e.g. descriptors, scalars, wave counters) + tuple ops; +}; + +template +struct Sm90VisitorImpl : Sm90VisitorImplBase { + + using Impl = Sm90VisitorImplBase; + using Params = typename Impl::Params; + using SharedStorage = typename Impl::SharedStorage; + + CUTLASS_HOST_DEVICE + Sm90VisitorImpl() {} + + CUTLASS_HOST_DEVICE + Sm90VisitorImpl(Params const& params, SharedStorage const& shared_storage) + : Impl(params, shared_storage) {} + + using Impl::ops; + + // + // Queries for kernel runtime + // + + // Is a specialized warp for producer TMA loads needed + // e.g. Aux tensor loads, broadcasts using TMA bulk copy + // This condition cannot change between work tiles because it is used + // to determine whether the load warp should exit early or not + // e.g. 
for batched beta this must always be true regardless of current batch idx + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return cute::apply(ops, + [] (auto const&... op) { + return (false || ... || op.is_producer_load_needed()); + } + ); + } + + // Is a producer TMA load specifically for C needed + // If this is true then is_producer_load_needed must also be true + // This condition can change between work tiles because it is only used + // to determine whether the TMA and smem loads for C of a given tile should happen + // e.g. for batched beta this can be false depending on current batch idx + CUTLASS_DEVICE bool + is_C_load_needed() const { + return cute::apply(ops, + [] (auto const&... op) { + return (false || ... || op.is_C_load_needed()); + } + ); + } + + // + // Producer load callbacks, called by the epilogue load warp. + // Operations usually only define this if TMA load is needed. Most operations will reuse this empy implementation + // Load callbacks are responsible for issuing corresponding mbarrier expect-tx ops for any TMA loads issued, but + // are not responsible for issuing the producer_commit barrier arrival, which is issued by the collective instead + // If this is non-empty, is_producer_load_needed must be true. + // + template + struct ProducerLoadCallbacks { + // Callbacks can store non-persistent variables (e.g. tensors) or copies of persistent variables + CallbacksTuple callbacks_tuple; + + // Before entry of the subtile load loop + CUTLASS_DEVICE void + begin() { + for_each(callbacks_tuple, + [&] (auto& callbacks) { + callbacks.begin(); + } + ); + } + + // Entry of the subtile load loop. Aux loads usually performed here + // Upon entry the producer acquire of the current subtile lock has completed. + // Upon exit all TMA loads for this subtile must have been issued, with corresponding expect-tx operations + CUTLASS_DEVICE void + step(uint64_t* full_mbarrier_ptr, int epi_m, int epi_n, int load_iteration, bool issue_tma_load) { + for_each(callbacks_tuple, + [&] (auto& callbacks) { + callbacks.step(full_mbarrier_ptr, epi_m, epi_n, load_iteration, issue_tma_load); + } + ); + } + + // Exit of the subtile load loop. + CUTLASS_DEVICE void + end() { + for_each(callbacks_tuple, + [] (auto& callbacks) { + callbacks.end(); + } + ); + } + }; + + // Producer load callbacks factory + // All operations must redefine this, but most can just dispatch to the base impl + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return transform_apply(ops, + [&] (auto& op) { + return op.get_producer_load_callbacks(args); + }, + [] (auto&&... callbacks) { + auto callbacks_tuple = cute::make_tuple(callbacks...); + return ProducerLoadCallbacks{callbacks_tuple}; + } + ); + } + + // + // Consumer store callbacks, called by the epilogue store warps. + // All operations must redefine this, with optional inheritance from this empty implementation. + // + template + struct ConsumerStoreCallbacks { + // Callbacks can store non-persistent variables (e.g. tensors) or copies of persistent variables + CallbacksTuple callbacks_tuple; + + // Before entry of subtile store loop. Gmem broadcasts usually performed here. 
+ CUTLASS_DEVICE void + begin() { + for_each(callbacks_tuple, + [] (auto& callbacks) { + callbacks.begin(); + } + ); + } + + // Start of subtile store iteration + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + for_each(callbacks_tuple, + [&] (auto& callbacks) { + callbacks.begin_loop(epi_m, epi_n); + } + ); + } + + // Before visit callback. Smem broadcasts usually performed here. + // Upon entry, all producer loads for this subtile are completed and visible. + CUTLASS_DEVICE void + previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) { + for_each(callbacks_tuple, + [&] (auto& callbacks) { + callbacks.previsit(epi_m, epi_n, load_iteration, is_producer_load_needed); + } + ); + } + + // Perform the fused elementwise computation + template + CUTLASS_DEVICE auto // returns an Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const&... frg_inputs) // depends on the N-naryness of the op + = delete; // Must be implemented for each operation + + // After visit call. Smem reductions usually performed here + // reduction_buffer is an arbitrary smem tensor that can be used for workspace + // It is each nodes reponsibility to assert that this buffer is sufficiently sized + // and to ensure that this buffer is no longer needed upon callback exit + // i.e. results are synchronized and no longer in the reduction buffer + // + // visit_results is a rmem tensor that contains the results of visit() for an entire + // on the current epilogue subtile + template + CUTLASS_DEVICE void + reduce(STensor&& reduction_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) { + for_each(callbacks_tuple, + [&] (auto& callbacks) { + callbacks.reduce(reduction_buffer, sync_fn, epi_m, epi_n, is_last_iteration, visit_results); + } + ); + } + + // After reduce call, before smem async fence. Smem stores usually performed here. + // Upon exit, all smem stores for TMA must have been issued + CUTLASS_DEVICE void + postreduce(int epi_m, int epi_n, int store_iteration, bool issue_smem_store) { + for_each(callbacks_tuple, + [&] (auto& callbacks) { + callbacks.postreduce(epi_m, epi_n, store_iteration, issue_smem_store); + } + ); + } + + // After smem async fence, before TMA store commit. Aux stores usually performed here + // Upon exit, all TMA stores for this subtile must have been issued + // Because of the TMA store delay optimization, this entry point must ONLY be used for TMA stores + // other gmem stores can be placed in the reduce or postreduce entry points + CUTLASS_DEVICE void + tma_store(int epi_m, int epi_n, int store_iteration, bool issue_tma_store) { + for_each(callbacks_tuple, + [&] (auto& callbacks) { + callbacks.tma_store(epi_m, epi_n, store_iteration, issue_tma_store); + } + ); + } + + // End of subtile store iteration + CUTLASS_DEVICE void + end_loop(int epi_m, int epi_n) { + for_each(callbacks_tuple, + [&] (auto& callbacks) { + callbacks.end_loop(epi_m, epi_n); + } + ); + } + + // Exit of subtile store loop. Gmem reductions usually performed here. + CUTLASS_DEVICE void + end() { + for_each(callbacks_tuple, + [&] (auto& callbacks) { + callbacks.end(); + } + ); + } + }; + + // Consumer store callbacks factory + // All operations must redefine this + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + return transform_apply(ops, + [&] (auto& op) { + return op.template get_consumer_store_callbacks(args); + }, + [] (auto&&... callbacks) { + auto callbacks_tuple = cute::make_tuple(callbacks...); + return ConsumerStoreCallbacks{callbacks_tuple}; + } + ); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Convenience aliases +using EmptyProducerLoadCallbacks = Sm90VisitorImpl<>::ProducerLoadCallbacks>; +using EmptyConsumerStoreCallbacks = Sm90VisitorImpl<>::ConsumerStoreCallbacks>; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace detail + +using namespace detail; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tree visitor +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Sm90TreeVisitor : Sm90VisitorImpl { + + using Impl = Sm90VisitorImpl; + using Params = typename Impl::Params; + using SharedStorage = typename Impl::SharedStorage; + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor() {} + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor( + Params const& params, + SharedStorage const& shared_storage) + : Impl(params, shared_storage) {} + + template + struct ConsumerStoreCallbacks : CallbacksImpl { + CUTLASS_DEVICE + ConsumerStoreCallbacks(CallbacksImpl&& impl) + : CallbacksImpl(cute::forward(impl)) {} + + using CallbacksImpl::callbacks_tuple; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + constexpr int Rm1 = sizeof...(ChildOps); + return cute::detail::tapply(callbacks_tuple, + [&] (auto& child_callbacks) { + return child_callbacks.visit(frg_acc, epi_v, epi_m, epi_n); // child ops must be nullary (e.g. loads, trees) + }, + [&] (auto&&... frg_inputs) { + return get(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...); + }, + make_seq{} // restrict the transform to R-1 child ops, apply is for node op + ); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto callbacks_tuple = Sm90VisitorImpl:: + template get_consumer_store_callbacks(args); + return ConsumerStoreCallbacks(std::move(callbacks_tuple)); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// DAG visitors +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Most DAG fusions can be represented as a set of output trees with a common input tree +// The common input is first evaluated, then the result is passed as the acc fragment to the output trees +template +struct Sm90SplitTreeVisitor : Sm90VisitorImpl { + + using Sm90VisitorImpl::Sm90VisitorImpl; + + template + struct ConsumerStoreCallbacks : CallbacksImpl { + CUTLASS_DEVICE + ConsumerStoreCallbacks(CallbacksImpl&& impl) + : CallbacksImpl(cute::forward(impl)) {} + + using CallbacksImpl::callbacks_tuple; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_input = get<0>(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); + + constexpr int Rm2 = sizeof...(AuxOutTrees); + cute::for_each(make_seq{}, // restrict the sequence to aux out trees + [&] (auto I) { + get(callbacks_tuple).visit(frg_input, epi_v, epi_m, epi_n); + } + ); + + return get(callbacks_tuple).visit(frg_input, epi_v, epi_m, epi_n); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto callbacks_tuple = Sm90VisitorImpl:: + template get_consumer_store_callbacks(args); + return ConsumerStoreCallbacks(std::move(callbacks_tuple)); + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + // deducing the output type for all the nodes is tricky so we just convert them all to a common type + // if multiple compute types are needed then split into multiple subgraphs grouped by type + class ElementCompute, + class EdgeTuple, // tuple of int_sequence, each sequence is the children indices (indexed by topological order) for each node + class... Ops // in topological order, last op is the output. EdgeTuple must match this order +> +struct Sm90TopologicalVisitor : Sm90VisitorImpl { + static_assert(is_static_v); + static_assert(cute::rank(EdgeTuple{}) == sizeof...(Ops)); + static_assert(sizeof...(Ops) > 1); + + using Sm90VisitorImpl::Sm90VisitorImpl; + + template + struct ConsumerStoreCallbacks : CallbacksImpl { + CUTLASS_DEVICE + ConsumerStoreCallbacks(CallbacksImpl&& impl) + : CallbacksImpl(cute::forward(impl)) {} + + using CallbacksImpl::callbacks_tuple; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + constexpr int Rm1 = sizeof...(Ops) - 1; + auto frg_compute_tuple = cute::repeat(Array{}); + + return cute::detail::tapply(EdgeTuple{}, callbacks_tuple, frg_compute_tuple, + // Visit the first R-1 ops in topological order + [&] (auto&& edge_seq, auto& callbacks, auto& frg_compute) { + frg_compute = cute::detail::apply(frg_compute_tuple, + // Compute the current op with children inputs + [&] (auto const&... 
frg_inputs) { + auto frg_output = callbacks.visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...); + using ElementOutput = typename decltype(frg_output)::Element; + using ConvertOutput = NumericArrayConverter; + ConvertOutput convert_output{}; + + return convert_output(frg_output); + }, + // Get inputs in the sequence given by the children indices of the current op + edge_seq + ); + return frg_compute; // unused + }, + // Visit the last op + [&] (auto const&...ops) { + return cute::detail::apply(frg_compute_tuple, + // Compute the last op with children inputs + [&] (auto const&... frg_inputs) { + return get(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...); + }, + // Get inputs in the sequence given by the children indices of the last op + get(EdgeTuple{}) + ); + }, + // Transform to visit R-1 ops, apply to visit last op + make_seq{} + ); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto callbacks_tuple = Sm90VisitorImpl:: + template get_consumer_store_callbacks(args); + return ConsumerStoreCallbacks(std::move(callbacks_tuple)); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Base specializations so we can have standard layout params and simple aggregate initializers +namespace detail { + +template +struct Sm90VisitorImplBase { + + // Retain tuple for SharedStorage because empty structs have 1B alignment + // tuples use multiple inheritance, avoids this problem + using SharedStorage = tuple< + typename Op0::SharedStorage + >; + + struct Arguments { + typename Op0::Arguments op_0; + }; + + struct Params { + typename Op0::Params op_0; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return Params{ + Op0::to_underlying_arguments(problem_shape, args.op_0, workspace) + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return Op0::can_implement(problem_shape, args.op_0); + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + size_t workspace_size = 0; + workspace_size += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + return workspace_size; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase() {} + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) + : ops({ + Op0(params.op_0, get<0>(shared_storage)) + }) {} + + tuple ops; +}; + +template +struct Sm90VisitorImplBase { + + using SharedStorage = tuple< + typename 
Op0::SharedStorage, + typename Op1::SharedStorage + >; + + struct Arguments { + typename Op0::Arguments op_0; + typename Op1::Arguments op_1; + }; + + struct Params { + typename Op0::Params op_0; + typename Op1::Params op_1; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, args.op_0); + uint8_t* op_0_workspace = reinterpret_cast(workspace); + uint8_t* op_1_workspace = op_0_workspace + op_0_workspace_size; + return Params{ + Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace), + Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace) + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return Op0::can_implement(problem_shape, args.op_0) && + Op1::can_implement(problem_shape, args.op_1); + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + size_t workspace_size = 0; + workspace_size += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + return workspace_size; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase() {} + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) + : ops({ + Op0(params.op_0, get<0>(shared_storage)), + Op1(params.op_1, get<1>(shared_storage)) + }) {} + + tuple ops; +}; + +template +struct Sm90VisitorImplBase { + + using SharedStorage = tuple< + typename Op0::SharedStorage, + typename Op1::SharedStorage, + typename Op2::SharedStorage + >; + + struct Arguments { + typename Op0::Arguments op_0; + typename Op1::Arguments op_1; + typename Op2::Arguments op_2; + }; + + struct Params { + typename Op0::Params op_0; + typename Op1::Params op_1; + typename Op2::Params op_2; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, args.op_0); + size_t op_1_workspace_size = Op1::get_workspace_size(problem_shape, args.op_1); + uint8_t* op_0_workspace = reinterpret_cast(workspace); + uint8_t* op_1_workspace = op_0_workspace + op_0_workspace_size; + 
uint8_t* op_2_workspace = op_1_workspace + op_1_workspace_size; + return Params{ + Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace), + Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace), + Op2::to_underlying_arguments(problem_shape, args.op_2, op_2_workspace) + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return Op0::can_implement(problem_shape, args.op_0) && + Op1::can_implement(problem_shape, args.op_1) && + Op2::can_implement(problem_shape, args.op_2); + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + size_t workspace_size = 0; + workspace_size += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op2::get_workspace_size(problem_shape, args.op_2); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + return workspace_size; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op2::initialize_workspace(problem_shape, args.op_2, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op2::get_workspace_size(problem_shape, args.op_2); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase() {} + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) + : ops({ + Op0(params.op_0, get<0>(shared_storage)), + Op1(params.op_1, get<1>(shared_storage)), + Op2(params.op_2, get<2>(shared_storage)) + }) {} + + tuple ops; +}; + +template +struct Sm90VisitorImplBase { + + using SharedStorage = tuple< + typename Op0::SharedStorage, + typename Op1::SharedStorage, + typename Op2::SharedStorage, + typename Op3::SharedStorage + >; + + struct Arguments { + typename Op0::Arguments op_0; + typename Op1::Arguments op_1; + typename Op2::Arguments op_2; + typename Op3::Arguments op_3; + }; + + struct Params { + typename Op0::Params op_0; + typename Op1::Params op_1; + typename Op2::Params op_2; + typename Op3::Params op_3; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, 
args.op_0); + size_t op_1_workspace_size = Op1::get_workspace_size(problem_shape, args.op_1); + size_t op_2_workspace_size = Op2::get_workspace_size(problem_shape, args.op_2); + uint8_t* op_0_workspace = reinterpret_cast(workspace); + uint8_t* op_1_workspace = op_0_workspace + op_0_workspace_size; + uint8_t* op_2_workspace = op_1_workspace + op_1_workspace_size; + uint8_t* op_3_workspace = op_2_workspace + op_2_workspace_size; + return Params{ + Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace), + Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace), + Op2::to_underlying_arguments(problem_shape, args.op_2, op_2_workspace), + Op3::to_underlying_arguments(problem_shape, args.op_3, op_3_workspace) + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return Op0::can_implement(problem_shape, args.op_0) && + Op1::can_implement(problem_shape, args.op_1) && + Op2::can_implement(problem_shape, args.op_2) && + Op3::can_implement(problem_shape, args.op_3); + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + size_t workspace_size = 0; + workspace_size += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op2::get_workspace_size(problem_shape, args.op_2); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op3::get_workspace_size(problem_shape, args.op_3); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + return workspace_size; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op2::initialize_workspace(problem_shape, args.op_2, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op2::get_workspace_size(problem_shape, args.op_2); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op3::initialize_workspace(problem_shape, args.op_3, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op3::get_workspace_size(problem_shape, args.op_3); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase() {} + + CUTLASS_HOST_DEVICE 
+ Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) + : ops({ + Op0(params.op_0, get<0>(shared_storage)), + Op1(params.op_1, get<1>(shared_storage)), + Op2(params.op_2, get<2>(shared_storage)), + Op3(params.op_3, get<3>(shared_storage)) + }) {} + + tuple ops; +}; + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp new file mode 100644 index 0000000000..53c0dce8ba --- /dev/null +++ b/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp @@ -0,0 +1,759 @@ +/*************************************************************************************************** + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Visitor tree Top-K + Softmax fusion operation for sm90 TMA warp-specialized epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/workspace.h" + +#include "cute/tensor.hpp" +#include "sm90_visitor_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Top-K + Softmax reduction across columns +// Performs a reduction of top-K values across N, and finally performs a softmax on them, +// and sets values not in the top-K to 0. +// +// Assumptions: +// 1. CTA_N >= N (single tile across N, the mode which is reduced) +// 2. 
EPI_N >= N (single epilogue tile across N, because we can reduce and revisit one +// epilogue tile at a time.) +// 3. Top-K value is either 2 or 4. +// + +namespace detail { + +// Implementations for add to sorted list and merging sorted lists, +// with fast paths for lists of size 2 and 4 (Top-2 and Top-4). +// Generic implementations may result in greater register use and branching, +// and should be avoided. +// Fast paths for Top-2 and Top-4 are written in inline PTX directly. + +CUTLASS_DEVICE +Array top_2_reduce_scalar(Array a, float scalar) { + Array out; + asm volatile( + "{\n" + " .reg .f32 mx;\n" + " .reg .pred p;\n" + " max.f32 mx, %3, %4;\n" + " setp.gtu.f32 p, %2, %4;\n" + " selp.f32 %1, mx, %2, p;\n" + " selp.f32 %0, %2, %4, p;\n" + "}\n" : "=f"(out[0]), "=f"(out[1]) : "f"(a[0]), "f"(a[1]), "f"(scalar)); + return out; +} + +CUTLASS_DEVICE +Array top_2_reduce(Array a, Array b) { + Array out; + asm volatile( + "{\n" + " .reg .v2 .f32 mx;\n" + " .reg .pred p;\n" + " max.f32 mx.x, %3, %4;\n" // max(a1, b0) + " max.f32 mx.y, %2, %5;\n" // max(a0, b1) + " setp.gtu.f32 p, %2, %4;\n" // a0 > b0 + " selp.f32 %1, mx.x, mx.y, p;\n" // a0 > b0 ? max(a1, b0) : max(a0, b1) + " selp.f32 %0, %2, %4, p;\n" // a0 > b0 ? a0 : b0 + "}\n" : "=f"(out[0]), "=f"(out[1]) : + "f"(a[0]), "f"(a[1]), "f"(b[0]), "f"(b[1])); + return out; +} + +CUTLASS_DEVICE +Array top_4_reduce_scalar(Array a, float scalar) { + Array out; + asm volatile( + "{\n" + " .reg .f32 mx;\n" // max(a3, b) + " .reg .pred p0;\n" // a0 > b + " .reg .pred p1;\n" // a1 > b + " .reg .pred p2;\n" // a2 > b + " max.f32 mx, %7, %8;\n" // max(a3, b) + " setp.gtu.f32 p0, %4, %8;\n" // a0 > b + " setp.gtu.f32 p1, %5, %8;\n" // a1 > b + " setp.gtu.f32 p2, %6, %8;\n" // a2 > b + " selp.f32 %3, mx, %6, p2;\n" // a2 > b ? max(a3, b) : a2 + " selp.f32 %2, %6, %8, p2;\n" // a1 = a2 > b ? a2 : b + " selp.f32 %2, %2, %5, p1;\n" // a1 > b ? max(a2, b) : a1 == a1 > b ? a1 : old_a1 + " selp.f32 %1, %5, %8, p1;\n" // a0 = a1 > b ? a1 : b + " selp.f32 %1, %1, %4, p0;\n" // a0 > b ? max(a1, b) : a0 == a0 > b ? a0 : old_a0 + " selp.f32 %0, %4, %8, p0;\n" // a0 = a0 > b ? 
a0 : b + "}\n" : + "=f"(out[0]), "=f"(out[1]), "=f"(out[2]), "=f"(out[3]) : + "f"(a[0]), "f"(a[1]), "f"(a[2]), "f"(a[3]), "f"(scalar)); + return out; +} + +CUTLASS_DEVICE +Array top_4_reduce(Array a, Array b) { + Array out; + asm volatile( + "{\n" + " .reg .f32 mxa0b1;\n" // max(a0, b1) + " .reg .f32 mxa1b0;\n" // max(a1, b0) + + " .reg .f32 mxa2b0;\n" // max(a2, b0) + " .reg .f32 mxa1b1;\n" // max(a1, b1) + " .reg .f32 mxa0b2;\n" // max(a1, b1) + + " .reg .f32 mxa1b2;\n" // max(a1, b2) + " .reg .f32 mxa2b1;\n" // max(a2, b1) + " max.f32 mxa1b2, %5, %10;\n" + " max.f32 mxa2b1, %6, %9;\n" + + " .reg .f32 mxa3b0;\n" // max(a1, b2) + " .reg .f32 mxa0b3;\n" // max(a2, b1) + " max.f32 mxa3b0, %7, %8;\n" + " max.f32 mxa0b3, %4, %11;\n" + + " .reg .pred pa0b0;\n" // a0 > b0 + " .reg .pred pa1b0;\n" // a1 > b0 + " .reg .pred pa2b0;\n" // a2 > b0 + " .reg .pred pa0b1;\n" // a0 > b1 + " .reg .pred pa1b1;\n" // a1 > b1 + " .reg .pred pa0b2;\n" // a0 > b2 + " .reg .pred pb2a0;\n" // b1 > a0 + " .reg .pred pb1a0;\n" // b1 > a0 + + " setp.gtu.f32 pa0b0, %4, %8;\n" // a0 > b0 + " setp.gtu.f32 pa1b0, %5, %8;\n" // a1 > b0 + " setp.gtu.f32 pa2b0, %6, %8;\n" // a2 > b0 + " setp.gtu.f32 pa0b1, %4, %9;\n" // a0 > b1 + " setp.gtu.f32 pa1b1, %5, %9;\n" // a1 > b1 + " setp.gtu.f32 pa0b2, %4, %10;\n" // a0 > b2 + + " not.pred pb2a0, pa0b2;\n" + " not.pred pb1a0, pa0b1;\n" + + " selp.f32 mxa1b0, %5, %8, pa1b0;\n" // max(a1, b0) + " selp.f32 mxa0b1, %4, %9, pa0b1;\n" // max(a0, b1) + + " selp.f32 mxa1b1, %5, %9, pa1b1;\n" // max(a1, b1) + " selp.f32 mxa2b0, %6, %8, pa2b0;\n" // max(a2, b0) + " selp.f32 mxa0b2, %4, %10, pa0b2;\n" // max(a0, b2) + + // a0 + " selp.f32 %0, %4, %8, pa0b0;\n" // a0 = a0 > b0 ? a0 : b0 + + // a1 + " selp.f32 %1, mxa1b0, mxa0b1, pa0b0;\n" // a1 = a0 > b0 ? max(a1, b0) : max(a0, b1) + + // a2 + " mov.f32 %2, mxa1b1;\n" // a2 = max(a1, b1) ** most likely case + " selp.f32 %2, mxa2b0, %2, pa1b0;\n" // a0 > a1 > b0 + " selp.f32 %2, mxa0b2, %2, pb1a0;\n" // b0 > b1 > a0 + + // a3 + " mov.f32 %3, mxa1b2;\n" // a3 = max(a1, b2) ** one of the most likely cases + " selp.f32 %3, mxa2b1, %3, pa1b1;\n" // a3 = a1 > b1 ? max(a2, b1) ** second most likely case + " selp.f32 %3, mxa3b0, %3, pa2b0;\n" // a0 > a1 > a2 > b0 + " selp.f32 %3, mxa0b3, %3, pb2a0;\n" // b0 > b1 > b2 > a0 + "}\n" : + "=f"(out[0]), "=f"(out[1]), "=f"(out[2]), "=f"(out[3]) : + "f"(a[0]), "f"(a[1]), "f"(a[2]), "f"(a[3]), + "f"(b[0]), "f"(b[1]), "f"(b[2]), "f"(b[3])); + return out; +} + +// Assumption: array elements are sorted in descending order +// (a[0] is the largest element in a[].) +template +CUTLASS_DEVICE +void add_element_to_desc_sorted_array(cutlass::Array& a, Element b) { + if constexpr (N == 2 && is_same_v) { + a = top_2_reduce_scalar(a, b); + } + else if constexpr (N == 4 && is_same_v) { + a = top_4_reduce_scalar(a, b); + } + else { + // slower generic path with branching, slower, and can cause register spill + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < N; ++k) { + if (a[k] <= b) { + // Shift down + CUTLASS_PRAGMA_UNROLL + for (int l = N - 1; l > k; --l) { + a[l] = a[l-1]; + } + a[k] = b; + } + } + } +} + +// Assumption: array elements are sorted in descending order +// (a[0] and b[0] are the largest elements in a[] and b[].) 
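// For example, with Top-2 and a = {9, 5}, b = {7, 3}, the merge leaves a = {9, 7}: the two largest
// values of the union of the two sorted lists. A minimal host-side reference of this merge
// (illustrative only; std::array and the helper name are assumptions, not part of this header):
//
//   template <class T, std::size_t N>
//   std::array<T, N> merge_desc_reference(std::array<T, N> a, std::array<T, N> b) {
//     std::array<T, N> out{};
//     std::size_t i = 0, j = 0;
//     for (std::size_t k = 0; k < N; ++k) {
//       out[k] = (a[i] >= b[j]) ? a[i++] : b[j++];   // both lists are descending: take the larger head
//     }
//     return out;
//   }
//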
+template +CUTLASS_DEVICE +void merge_desc_sorted_arrays(cutlass::Array& a, const cutlass::Array& b) { + if constexpr (N == 2 && is_same_v) { + a = top_2_reduce(a, b); + } + else if constexpr (N == 4 && is_same_v) { + a = top_4_reduce(a, b); + } + else { + // slower generic path with branching, slower, and can cause register spill + int j = 0; + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < N; ++k) { + if (a[k] <= b[j]) { + // Shift down + CUTLASS_PRAGMA_UNROLL + for (int l = N - 1; l > k; --l) { + a[l] = a[l-1]; + } + a[k] = b[j]; + ++j; + } + } + } +} + +// Assumption: array elements are sorted in descending order +// (a[0] is the largest element in a[].) +template +CUTLASS_DEVICE +Element topk_logsumexp(cutlass::Array a) { + // Do one less `exp`, because we know what its result will be. + // Assume x is a set of `x_i`s, and `x_m` is the maximum of that set. + // logsumexp(x) = log(sum(x_i)) = m + log(sum(x_i - m)) = m + log(1 + sum_{i != m}(x_i - x_m)) + // Compute m + log(1 + sum_{i != m}(x_i - x_m)) + Element sum = Element(1.0); + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < N; ++i) { + sum += fast_exp(a[i] - a[0]); + } + return a[0] + fast_log(sum); +} + +CUTLASS_DEVICE +float fast_masked_softmax(float value, float minimum, float logsumexp) { + float new_value; + asm volatile( + "{\n" + " .reg .pred p0;\n" + // value >= minimum + " setp.geu.f32 p0, %1, %2;\n" + + " .reg .f32 x_lse;\n" + " .reg .f32 %%f<11>;\n" + " .reg .b32 %%r<3>;\n" + + // x_lse = value - minimum + " sub.rn.f32 x_lse, %1, %3;\n" + + // exp(x_lse) + // The following is derived from a ptx dump of expf. + // exp requires a base conversion from exp2. + " fma.rn.f32 %%f1, x_lse, 0f3BBB989D, 0f3F000000;\n" + " cvt.sat.f32.f32 %%f2, %%f1;\n" + " fma.rm.f32 %%f3, %%f2, 0f437C0000, 0f4B400001;\n" + " add.f32 %%f4, %%f3, 0fCB40007F;\n" + " neg.f32 %%f5, %%f4;\n" + " fma.rn.f32 %%f6, x_lse, 0f3FB8AA3B, %%f5;\n" + " fma.rn.f32 %%f7, x_lse, 0f32A57060, %%f6;\n" + " mov.b32 %%r1, %%f3;\n" + " shl.b32 %%r2, %%r1, 23;\n" + " mov.b32 %%f8, %%r2;\n" + " ex2.approx.ftz.f32 %%f9, %%f7;\n" + " mul.f32 %%f10, %%f9, %%f8;\n" + + // Mask or softmax + " selp.f32 %0, %%f10, 0f00000000, p0;\n" + "}\n" : "=f"(new_value) : "f"(value), "f"(minimum), "f"(logsumexp)); + return new_value; +} + +template +CUTLASS_DEVICE +Element masked_softmax(Element value, Element minimum, Element logsumexp) { + if constexpr (is_same_v) { + // Inline PTX implementation + // Significantly reduces register requirements + return fast_masked_softmax(value, minimum, logsumexp); + } + else { + return value < minimum ? Element(0.0) : fast_exp(value - logsumexp); + } +} + +} // namespace detail + +template < + int TopK, + int FragmentSize, + class CtaTileShapeMNK, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + int Alignment = 128 / sizeof_bits_v, + bool UseButterflyReduce = true +> +struct Sm90TopKSoftmaxColReduction { +private: + static_assert(is_same_v, "Fused Top-K + Softmax reduction requires FP32 accumulation."); + static_assert(TopK == 2 || TopK == 4, "Fused Top-K + Softmax reduction only supports K=2 and K=4."); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + + // Reduction tensors + // We have two tensors for this EVT node: a reduction tensor and a tensor holding + // final reduction values (tCrSoftmax). The reason for this is that Top-K and Softmax + // require different reductions, but those luckily overlap. 
Top-K obviously needs at least + // two values (K >= 2), and softmax needs one value: logsumexp. Logsumexp is simply the log + // of sum of exponents over the set, and is equivalent to m + sum(exp(x_i - m)), where m is the + // maximum of all x_i elements. Since safe softmax for any element x_i is computed as + // softmax(x_i) = exp(x_i - m) / sum_j(exp(x_j - max)) + // we can track logsumexp instead of tracking two variables (sum of exps and the max). + // In addition, subtracting logsumexp from any element and taking its exp is equivalent to + // computing its softmax. + // + // The overlap between softmax and top-K is that we don't need to reduce logsumexp along the + // way at all, because any element not in the top-K is going to be masked out and set to 0. + // Therefore, we only reduce the top-K elements, and when done, compute their logsumexp and + // keep it, and the smallest element in the top-K for masking out non-top-K elements. + // + // This means that our final reduction result will always be 2 elements, regardless of the value + // of K: minimum of top-K, and logsumexp. + // + // For each reduction tensor, we define a new struct for readability. + + struct ReductionResult { + ElementCompute min_; + ElementCompute logsumexp_; + + CUTLASS_DEVICE + ReductionResult() { } + + CUTLASS_DEVICE + ReductionResult(ElementCompute min, ElementCompute logsumexp): + logsumexp_(logsumexp), min_(min) { } + + // Warp shuffle broadcast + CUTLASS_DEVICE + void shuffle_up_sync(uint32_t delta, int lane_id) { + static_assert(sizeof(ReductionResult) == sizeof(uint64_t)); + uint64_t r = reinterpret_cast(*this); + r = __shfl_up_sync(0xFFFFFFFF, r, delta); + *this = (lane_id - static_cast(delta) >= 0) ? reinterpret_cast(r) : *this; + } + }; + + struct TopKResult { + Array top_k_; + + CUTLASS_DEVICE + TopKResult() { + top_k_.fill(-cutlass::platform::numeric_limits::infinity()); + } + + // This is where we do the "final" reduction, where we compute + // the logsumexp for softmax, keep the smallest value in top-K, + // and discard the rest. 
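    // As a concrete illustration (Top-4, values already sorted descending), if
    // top_k_ = {2.0, 1.0, 0.5, 0.25}, then the final reduction would produce (the numbers are an
    // example, not taken from a real run):
    //
    //   min_       = 0.25                                             // smallest retained value
    //   logsumexp_ = 2.0 + log(1 + e^{-1.0} + e^{-1.5} + e^{-1.75})   // ~= 2.57
    //
    // so that any retained element x can later be mapped to its softmax value exp(x - logsumexp_),
    // while elements below min_ are masked to zero.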
+ CUTLASS_DEVICE + ReductionResult reduce_final() const { + return ReductionResult(top_k_[TopK - 1], topk_logsumexp(top_k_)); + } + + // Butterfly reduction + CUTLASS_DEVICE + void shuffle_xor_sync(int laneMask) { + if constexpr (TopK == 2) { + static_assert(sizeof(TopKResult) == sizeof(uint64_t)); + uint64_t top_k = reinterpret_cast(*this); + top_k = __shfl_xor_sync(0xFFFFFFFF, top_k, laneMask); + auto synced_v = reinterpret_cast(top_k); + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + else if constexpr (TopK == 4) { + static_assert(sizeof(TopKResult) == 2 * sizeof(uint64_t)); + uint64_t* top_k_ptr = reinterpret_cast(this); + uint64_t top_k_arr[2]; + top_k_arr[0] = top_k_ptr[0]; + top_k_arr[1] = top_k_ptr[1]; + top_k_arr[0] = __shfl_xor_sync(0xFFFFFFFF, top_k_arr[0], laneMask); + top_k_arr[1] = __shfl_xor_sync(0xFFFFFFFF, top_k_arr[1], laneMask); + auto synced_v = reinterpret_cast(top_k_arr); + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + else { + TopKResult synced_v; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < TopK; ++i) { + synced_v.top_k_[i] = __shfl_xor_sync(0xFFFFFFFF, top_k_[i], laneMask); + } + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + } + + // Warp shuffle reduction + CUTLASS_DEVICE + void shuffle_down_sync(uint32_t delta) { + if constexpr (TopK == 2) { + static_assert(sizeof(TopKResult) == sizeof(uint64_t)); + uint64_t top_k = reinterpret_cast(*this); + top_k = __shfl_down_sync(0xFFFFFFFF, top_k, delta); + auto synced_v = reinterpret_cast(top_k); + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + else if constexpr (TopK == 4) { + static_assert(sizeof(TopKResult) == 2 * sizeof(uint64_t)); + uint64_t* top_k_ptr = reinterpret_cast(this); + uint64_t top_k_arr[2]; + top_k_arr[0] = top_k_ptr[0]; + top_k_arr[1] = top_k_ptr[1]; + top_k_arr[0] = __shfl_down_sync(0xFFFFFFFF, top_k_arr[0], delta); + top_k_arr[1] = __shfl_down_sync(0xFFFFFFFF, top_k_arr[1], delta); + auto synced_v = reinterpret_cast(top_k_arr); + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + else { + TopKResult synced_v; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < TopK; ++i) { + synced_v.top_k_[i] = __shfl_down_sync(0xFFFFFFFF, top_k_[i], delta); + } + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + } + }; + +public: + struct SharedStorage { }; + + struct Arguments { }; + + struct Params { }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return {}; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + auto [M, N, K, L] = problem_shape; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + // Cross CTA reduction is not possible because there is no guarantee that all CTAs run + // concurrently. + // Cross epilogue tile reduction is possible, but re-visiting and applying reduction + // to accumulators is only possible for the current epilogue tile. 
+ auto [epi_M, epi_N] = EpilogueTile{}; + return N <= tile_N && N <= epi_N && N >= TopK; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90TopKSoftmaxColReduction() { } + + CUTLASS_HOST_DEVICE + Sm90TopKSoftmaxColReduction(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params) + : args_tuple(cute::forward(args_tuple)), + params(params) {} + + ArgsTuple args_tuple; + Params const& params; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input) { + + auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, + lane_layout_MN, lane_mn, + residue_cCol, residue_tCcCol] = args_tuple; + Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n); + + using ConvertInput = NumericArrayConverter; + ConvertInput convert_input{}; + + Array frg_I = convert_input(frg_input); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + auto thread_crd = tCcCol_mn(epi_v * FragmentSize + i); + if (elem_less(thread_crd, residue_tCcCol)) { + TopKResult& tCrCol_vmn = tCrTopK(epi_v * FragmentSize + i); + detail::add_element_to_desc_sorted_array(tCrCol_vmn.top_k_, frg_I[i]); + } + } + + return frg_input; + } + + template + CUTLASS_DEVICE void + reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) { + + auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, + lane_layout_MN, lane_mn, + residue_cCol, residue_tCcCol] = args_tuple; + + // fully OOB CTA in partially OOB cluster + if (not elem_less(cCol(_0{},_0{}), residue_cCol)) { + return; + } + Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n); + + // `tCrTopK` and `tCrSoftmax` have 0-strides along modes that correspond to N, + // in order to reduce along modes in the `R2S` sublayout that correspond to N. + // This means we should modify and warp-reduce them according to their co-domain instead of + // their domain. Therefore we keep a filtered view of both and use them as necessary. + auto tCrTopK_f = filter(tCrTopK); + auto tCrSoftmax_f = filter(tCrSoftmax); + + // The pattern here is: reduce Top-K first, then compute logsumexp, keep it and the + // last element of Top-K, use the latter to mask the visited results, and the former + // to apply softmax. + // + // This gives us two options: reduce the Top-K with warp shuffles, have the reduced + // lanes compute logsumexp and pair it with the last Top-K element, and broadcast + // the result back using warp shuffles. + // + // Alternatively, we can do a butterfly reduction over Top-K, and have all lanes + // compute their own logsumexp and skip the broadcast. + if constexpr (UseButterflyReduce) { + // + // 1. 
Butterfly reduction + // + CUTLASS_PRAGMA_UNROLL + for (int j = 1; j < size<1>(lane_layout_MN); j *= 2) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrTopK_f); ++i) { + tCrTopK_f(i).shuffle_xor_sync(j); + } + } + + // + // 2. Strip down reduced value and compute sum of exps + // + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrSoftmax_f); ++i) { + tCrSoftmax_f(i) = tCrTopK_f(i).reduce_final(); + } + } + else { + // + // 1. Warp shuffle reduction + // + CUTLASS_PRAGMA_UNROLL + for (int reduction_cols = size<1>(lane_layout_MN) / 2; reduction_cols > 0; reduction_cols /= 2) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrTopK_f); ++i) { + tCrTopK_f(i).shuffle_down_sync(lane_layout_MN(_0{},reduction_cols)); + } + } + + // + // 2. Strip down reduced value and compute sum of exps + // + bool is_reduced_lane = get<1>(lane_mn) == 0; + if (is_reduced_lane) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrSoftmax_f); ++i) { + tCrSoftmax_f(i) = tCrTopK_f(i).reduce_final(); + } + } + + // + // 3. Broadcast reduced values to all participants + // + CUTLASS_PRAGMA_UNROLL + for (int broadcast_cols = 1; broadcast_cols <= size<1>(lane_layout_MN) / 2; broadcast_cols *= 2) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrSoftmax_f); ++i) { + tCrSoftmax_f(i).shuffle_up_sync(lane_layout_MN(_0{},broadcast_cols), get<1>(lane_mn)); + } + } + } + + // + // 4. Re-visit and apply top-K and softmax + // + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(visit_results); ++epi_v) { + auto& visit_frag = visit_results(epi_v); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + visit_frag[i] = detail::masked_softmax( + visit_frag[i], + tCrSoftmax(epi_v * FragmentSize + i).min_, + tCrSoftmax(epi_v * FragmentSize + i).logsumexp_ + ); + } + } + + } + + CUTLASS_DEVICE void + end_loop(int epi_m, int epi_n) { + auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, + lane_layout_MN, lane_mn, + residue_cCol, residue_tCcCol] = args_tuple; + + // Reset reduced top-K values for next tile + // This must be done because we only assume a single epilogue tile across N, + // but not M. + fill(tCrTopK, TopKResult()); + } + + CUTLASS_DEVICE void + end() { } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + Layout ref_layout_MN = [&] () { + if constexpr (ReferenceSrc) { return get<0>(args.tiled_copy.get_layoutS_MN()); } + else { return get<0>(args.tiled_copy.get_layoutD_MN()); } + }(); // tile_mn -> tv_idx + + // Get the MN layout + coord of lanes to determine shuffle reduction iterations + using _W = Int; + Layout tv2lane = Layout,_W,_1>,Stride<_1,_0,_0>>{}; // tv_idx -> lane_idx + Layout ref2lane = composition(tv2lane, ref_layout_MN); // tile_mn -> lane_idx + Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane))); // lane_mn -> lane_idx + Layout inv_lane_layout_MN = right_inverse(lane_layout_MN); // lane_idx -> lane_mn + int lane_idx = canonical_lane_idx(); + auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN)); + + // Get the MN layout + coord of warps to determine smem reduction iterations + Layout tv2warp = Layout,_W,_1>,Stride<_0,_1,_0>>{}; // tv_idx -> warp_idx + Layout ref2warp = composition(tv2warp, ref_layout_MN); // tile_mn -> warp_idx + Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp))); // warp_mn -> warp_idx + + // Make sure there's only one warp across N so we can use warp shuffle intrinsics for reduction. + static_assert(decltype(size<1>(warp_layout_MN))::value <= 1); + + // Reduction layout + // We're assuming all elements in a row (over which we're performing the reduction) are + // visited in the same corresponding epilogue tile, and this is what allows us to apply the + // top-K + softmax operation within `reduce()`, by re-visiting the accumulated results. + // + // This presents a challenge, because the layout of the accumulated results is typically in + // in the register to shared memory shape, or: (R2S,R2S_M,R2S_N). + // This means that we still need to reduce this tensor along N. + // + // The solution is simple: we need to flatten the layout, identify modes that correspond to + // N and set their strides to 0, in order to map fragment indices corresponding to the same + // row back to the same element in the tensor. + // + // This requires some extra layout manipulation, which is as follows. + + // Create new accumulator layout with column broadcast + auto [M, N, K] = args.tile_shape_mnk; + auto thr_mma = args.tiled_mma.get_thread_slice(args.thread_idx); + auto gColReduce = make_tensor( + make_layout(make_shape(M, N), make_stride(_1{}, 0_c))); // (M,N) + auto tCrColReduce = make_tensor_like( // (FrgV, MMA_M, MMA_N) + thr_mma.partition_C(gColReduce).layout()); + + // Tile the new accumulator tensor according to R2S + ThrCopy thread_r2s = args.tiled_copy.get_slice(args.thread_idx); + Tensor tRS_rSoftmax = thread_r2s.retile_S(tCrColReduce); // ((R2S,R2S_V),MMA_M,MMA_N) + auto tCrC_layout = args.tCrC.layout(); // (R2S,R2S_M,R2S_N) + + // Compose the new accumulator R2S layout with the expected tCrC layout to get final + // reduction tensor layout. 
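    // The column-broadcast trick above relies on a stride-0 mode: every column index of a row maps
    // to the same storage element, so all fragment values belonging to one row land in the same
    // reduction slot. A small standalone illustration (the shapes here are made up for the example):
    //
    //   auto bcast = make_layout(make_shape(4, 8), make_stride(_1{}, _0{}));   // (M,N):(1,0)
    //   // bcast(2, 0) == bcast(2, 5) == 2 -- the N mode is collapsed
    //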
+ auto tCrSoftmax_layout = take<0, 3>(tRS_rSoftmax.layout()).compose(tCrC_layout); // (R2S,R2S_V) o (R2S,R2S_M,R2S_N) + + Tensor tCrTopK = make_tensor(tCrSoftmax_layout); // (R2S,R2S_M,R2S_N) + Tensor tCrSoftmax = make_tensor(tCrSoftmax_layout); // (R2S,R2S_M,R2S_N) + fill(tCrTopK, TopKResult()); + + auto args_tuple = make_tuple( + cute::move(tCrTopK), cute::move(tCrSoftmax), args.tCcD, args.cD, + lane_layout_MN, lane_mn, + args.residue_cD, args.residue_tCcD); + return ConsumerStoreCallbacks(std::move(args_tuple), params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index 9763f5fc1e..186e996602 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -37,6 +37,7 @@ #include "cutlass/cutlass.h" #include "cutlass/numeric_types.h" +#include "cutlass/numeric_conversion.h" #include "cutlass/constants.h" #include "cutlass/complex.h" #include "cutlass/array.h" @@ -51,23 +52,88 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// +// Identity operator template struct Identity { + static const bool kIsHeavy = false; + CUTLASS_HOST_DEVICE T operator()(T value) const { return value; } }; -///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct Identity > { + CUTLASS_HOST_DEVICE + Array operator()(Array value) const { + return value; + } +}; + +/// Scale operator +template +struct Scale { + struct Arguments { + using scale_type = T; + T scale = T(1); + }; + + CUTLASS_HOST_DEVICE + T operator()(T value, T scale) const { + multiplies mul; + return mul(scale, value); + } + + CUTLASS_HOST_DEVICE + T operator()(T value, Arguments args = Arguments()) const { + return this->operator()(value, args.scale); + } +}; + +template +struct Scale> { + using Arguments = typename Scale::Arguments; + + CUTLASS_HOST_DEVICE + Array operator()(Array values, T scale) const { + multiplies> mul; + return mul(scale, values); + } + + CUTLASS_HOST_DEVICE + Array operator()(Array values, Arguments args = Arguments()) const { + return this->operator()(values, args.scale); + } +}; + +/// Specialization to compose other activations with a defined unary operator +/// e.g. Scale> +template